xref: /openbmc/qemu/block/io.c (revision 90c84c56006747537e9e4240271523c4c3b7a481)
1  /*
2   * Block layer I/O functions
3   *
4   * Copyright (c) 2003 Fabrice Bellard
5   *
6   * Permission is hereby granted, free of charge, to any person obtaining a copy
7   * of this software and associated documentation files (the "Software"), to deal
8   * in the Software without restriction, including without limitation the rights
9   * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10   * copies of the Software, and to permit persons to whom the Software is
11   * furnished to do so, subject to the following conditions:
12   *
13   * The above copyright notice and this permission notice shall be included in
14   * all copies or substantial portions of the Software.
15   *
16   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19   * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22   * THE SOFTWARE.
23   */
24  
25  #include "qemu/osdep.h"
26  #include "trace.h"
27  #include "sysemu/block-backend.h"
28  #include "block/aio-wait.h"
29  #include "block/blockjob.h"
30  #include "block/blockjob_int.h"
31  #include "block/block_int.h"
32  #include "qemu/cutils.h"
33  #include "qapi/error.h"
34  #include "qemu/error-report.h"
35  
36  #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
37  
38  /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
39  #define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
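/* With BDRV_SECTOR_BITS at its usual value of 9, this evaluates to
 * 32768 * 512 bytes, i.e. a 16 MiB cap on the bounce buffer. */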
40  
41  static void bdrv_parent_cb_resize(BlockDriverState *bs);
42  static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
43      int64_t offset, int bytes, BdrvRequestFlags flags);
44  
45  void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
46                                 bool ignore_bds_parents)
47  {
48      BdrvChild *c, *next;
49  
50      QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
51          if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
52              continue;
53          }
54          bdrv_parent_drained_begin_single(c, false);
55      }
56  }
57  
58  void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
59                               bool ignore_bds_parents)
60  {
61      BdrvChild *c, *next;
62  
63      QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
64          if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
65              continue;
66          }
67          if (c->role->drained_end) {
68              c->role->drained_end(c);
69          }
70      }
71  }
72  
73  static bool bdrv_parent_drained_poll_single(BdrvChild *c)
74  {
75      if (c->role->drained_poll) {
76          return c->role->drained_poll(c);
77      }
78      return false;
79  }
80  
81  static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
82                                       bool ignore_bds_parents)
83  {
84      BdrvChild *c, *next;
85      bool busy = false;
86  
87      QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
88          if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
89              continue;
90          }
91          busy |= bdrv_parent_drained_poll_single(c);
92      }
93  
94      return busy;
95  }
96  
97  void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
98  {
99      if (c->role->drained_begin) {
100          c->role->drained_begin(c);
101      }
102      if (poll) {
103          BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
104      }
105  }
106  
107  static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
108  {
109      dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
110      dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
111      dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
112                                   src->opt_mem_alignment);
113      dst->min_mem_alignment = MAX(dst->min_mem_alignment,
114                                   src->min_mem_alignment);
115      dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
116  }
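/*
 * Illustrative example of how limits merge: if the parent reports no
 * max_transfer limit (0) and a child reports 64 KiB, MIN_NON_ZERO() keeps
 * the 64 KiB child limit, while opt_transfer and the memory alignments take
 * the larger of the two values, so the merged limits satisfy both nodes.
 */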
117  
118  void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
119  {
120      BlockDriver *drv = bs->drv;
121      Error *local_err = NULL;
122  
123      memset(&bs->bl, 0, sizeof(bs->bl));
124  
125      if (!drv) {
126          return;
127      }
128  
129      /* Default alignment based on whether driver has byte interface */
130      bs->bl.request_alignment = (drv->bdrv_co_preadv ||
131                                  drv->bdrv_aio_preadv) ? 1 : 512;
132  
133      /* Take some limits from the children as a default */
134      if (bs->file) {
135          bdrv_refresh_limits(bs->file->bs, &local_err);
136          if (local_err) {
137              error_propagate(errp, local_err);
138              return;
139          }
140          bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
141      } else {
142          bs->bl.min_mem_alignment = 512;
143          bs->bl.opt_mem_alignment = getpagesize();
144  
145          /* Safe default since most protocols use readv()/writev()/etc */
146          bs->bl.max_iov = IOV_MAX;
147      }
148  
149      if (bs->backing) {
150          bdrv_refresh_limits(bs->backing->bs, &local_err);
151          if (local_err) {
152              error_propagate(errp, local_err);
153              return;
154          }
155          bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
156      }
157  
158      /* Then let the driver override it */
159      if (drv->bdrv_refresh_limits) {
160          drv->bdrv_refresh_limits(bs, errp);
161      }
162  }
163  
164  /**
165   * The copy-on-read flag is actually a reference count so multiple users may
166   * use the feature without worrying about clobbering its previous state.
167   * Copy-on-read stays enabled until all users have disabled it again.
168   */
169  void bdrv_enable_copy_on_read(BlockDriverState *bs)
170  {
171      atomic_inc(&bs->copy_on_read);
172  }
173  
174  void bdrv_disable_copy_on_read(BlockDriverState *bs)
175  {
176      int old = atomic_fetch_dec(&bs->copy_on_read);
177      assert(old >= 1);
178  }
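/*
 * Illustrative pairing (a sketch, not taken from a specific caller): each
 * user brackets its use of the feature, and the reference count keeps
 * copy-on-read enabled as long as at least one user remains:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... issue reads that should populate the top layer ...
 *     bdrv_disable_copy_on_read(bs);
 */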
179  
180  typedef struct {
181      Coroutine *co;
182      BlockDriverState *bs;
183      bool done;
184      bool begin;
185      bool recursive;
186      bool poll;
187      BdrvChild *parent;
188      bool ignore_bds_parents;
189  } BdrvCoDrainData;
190  
191  static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
192  {
193      BdrvCoDrainData *data = opaque;
194      BlockDriverState *bs = data->bs;
195  
196      if (data->begin) {
197          bs->drv->bdrv_co_drain_begin(bs);
198      } else {
199          bs->drv->bdrv_co_drain_end(bs);
200      }
201  
202      /* Set data->done before reading bs->wakeup.  */
203      atomic_mb_set(&data->done, true);
204      bdrv_dec_in_flight(bs);
205  
206      if (data->begin) {
207          g_free(data);
208      }
209  }
210  
211  /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
212  static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
213  {
214      BdrvCoDrainData *data;
215  
216      if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
217              (!begin && !bs->drv->bdrv_co_drain_end)) {
218          return;
219      }
220  
221      data = g_new(BdrvCoDrainData, 1);
222      *data = (BdrvCoDrainData) {
223          .bs = bs,
224          .done = false,
225          .begin = begin
226      };
227  
228      /* Make sure the driver callback completes during the polling phase for
229       * drain_begin. */
230      bdrv_inc_in_flight(bs);
231      data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
232      aio_co_schedule(bdrv_get_aio_context(bs), data->co);
233  
234      if (!begin) {
235          BDRV_POLL_WHILE(bs, !data->done);
236          g_free(data);
237      }
238  }
239  
240  /* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
241  bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
242                       BdrvChild *ignore_parent, bool ignore_bds_parents)
243  {
244      BdrvChild *child, *next;
245  
246      if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
247          return true;
248      }
249  
250      if (atomic_read(&bs->in_flight)) {
251          return true;
252      }
253  
254      if (recursive) {
255          assert(!ignore_bds_parents);
256          QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
257              if (bdrv_drain_poll(child->bs, recursive, child, false)) {
258                  return true;
259              }
260          }
261      }
262  
263      return false;
264  }
265  
266  static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
267                                        BdrvChild *ignore_parent)
268  {
269      return bdrv_drain_poll(bs, recursive, ignore_parent, false);
270  }
271  
272  static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
273                                    BdrvChild *parent, bool ignore_bds_parents,
274                                    bool poll);
275  static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
276                                  BdrvChild *parent, bool ignore_bds_parents);
277  
278  static void bdrv_co_drain_bh_cb(void *opaque)
279  {
280      BdrvCoDrainData *data = opaque;
281      Coroutine *co = data->co;
282      BlockDriverState *bs = data->bs;
283  
284      if (bs) {
285          AioContext *ctx = bdrv_get_aio_context(bs);
286          AioContext *co_ctx = qemu_coroutine_get_aio_context(co);
287  
288          /*
289           * When the coroutine yielded, the lock for its home context was
290           * released, so we need to re-acquire it here. If it explicitly
291           * acquired a different context, the lock is still held and we don't
292           * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
293           */
294          if (ctx == co_ctx) {
295              aio_context_acquire(ctx);
296          }
297          bdrv_dec_in_flight(bs);
298          if (data->begin) {
299              bdrv_do_drained_begin(bs, data->recursive, data->parent,
300                                    data->ignore_bds_parents, data->poll);
301          } else {
302              bdrv_do_drained_end(bs, data->recursive, data->parent,
303                                  data->ignore_bds_parents);
304          }
305          if (ctx == co_ctx) {
306              aio_context_release(ctx);
307          }
308      } else {
309          assert(data->begin);
310          bdrv_drain_all_begin();
311      }
312  
313      data->done = true;
314      aio_co_wake(co);
315  }
316  
317  static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
318                                                  bool begin, bool recursive,
319                                                  BdrvChild *parent,
320                                                  bool ignore_bds_parents,
321                                                  bool poll)
322  {
323      BdrvCoDrainData data;
324  
325      /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
326       * other coroutines run if they were queued by aio_co_enter(). */
327  
328      assert(qemu_in_coroutine());
329      data = (BdrvCoDrainData) {
330          .co = qemu_coroutine_self(),
331          .bs = bs,
332          .done = false,
333          .begin = begin,
334          .recursive = recursive,
335          .parent = parent,
336          .ignore_bds_parents = ignore_bds_parents,
337          .poll = poll,
338      };
339      if (bs) {
340          bdrv_inc_in_flight(bs);
341      }
342      aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
343                              bdrv_co_drain_bh_cb, &data);
344  
345      qemu_coroutine_yield();
346      /* If we are resumed from some other event (such as an aio completion or a
347       * timer callback), it is a bug in the caller that should be fixed. */
348      assert(data.done);
349  }
350  
351  void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
352                                     BdrvChild *parent, bool ignore_bds_parents)
353  {
354      assert(!qemu_in_coroutine());
355  
356      /* Stop things in parent-to-child order */
357      if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
358          aio_disable_external(bdrv_get_aio_context(bs));
359      }
360  
361      bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
362      bdrv_drain_invoke(bs, true);
363  }
364  
365  static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
366                                    BdrvChild *parent, bool ignore_bds_parents,
367                                    bool poll)
368  {
369      BdrvChild *child, *next;
370  
371      if (qemu_in_coroutine()) {
372          bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
373                                 poll);
374          return;
375      }
376  
377      bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
378  
379      if (recursive) {
380          assert(!ignore_bds_parents);
381          bs->recursive_quiesce_counter++;
382          QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
383              bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
384                                    false);
385          }
386      }
387  
388      /*
389       * Wait for drained requests to finish.
390       *
391       * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
392       * call is needed so things in this AioContext can make progress even
393       * though we don't return to the main AioContext loop - this automatically
394       * includes other nodes in the same AioContext and therefore all child
395       * nodes.
396       */
397      if (poll) {
398          assert(!ignore_bds_parents);
399          BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
400      }
401  }
402  
403  void bdrv_drained_begin(BlockDriverState *bs)
404  {
405      bdrv_do_drained_begin(bs, false, NULL, false, true);
406  }
407  
408  void bdrv_subtree_drained_begin(BlockDriverState *bs)
409  {
410      bdrv_do_drained_begin(bs, true, NULL, false, true);
411  }
412  
413  static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
414                                  BdrvChild *parent, bool ignore_bds_parents)
415  {
416      BdrvChild *child, *next;
417      int old_quiesce_counter;
418  
419      if (qemu_in_coroutine()) {
420          bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
421                                 false);
422          return;
423      }
424      assert(bs->quiesce_counter > 0);
425      old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
426  
427      /* Re-enable things in child-to-parent order */
428      bdrv_drain_invoke(bs, false);
429      bdrv_parent_drained_end(bs, parent, ignore_bds_parents);
430      if (old_quiesce_counter == 1) {
431          aio_enable_external(bdrv_get_aio_context(bs));
432      }
433  
434      if (recursive) {
435          assert(!ignore_bds_parents);
436          bs->recursive_quiesce_counter--;
437          QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
438              bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents);
439          }
440      }
441  }
442  
443  void bdrv_drained_end(BlockDriverState *bs)
444  {
445      bdrv_do_drained_end(bs, false, NULL, false);
446  }
447  
448  void bdrv_subtree_drained_end(BlockDriverState *bs)
449  {
450      bdrv_do_drained_end(bs, true, NULL, false);
451  }
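/*
 * Illustrative usage (a sketch; actual callers live in block.c and the
 * block jobs): graph manipulation is bracketed by a drained section so that
 * no requests are in flight while the node is reconfigured:
 *
 *     bdrv_drained_begin(bs);
 *     ... detach/attach children, change options, ...
 *     bdrv_drained_end(bs);
 *
 * bdrv_subtree_drained_begin()/end() do the same recursively for the whole
 * subtree below bs.
 */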
452  
453  void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
454  {
455      int i;
456  
457      for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
458          bdrv_do_drained_begin(child->bs, true, child, false, true);
459      }
460  }
461  
462  void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
463  {
464      int i;
465  
466      for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
467          bdrv_do_drained_end(child->bs, true, child, false);
468      }
469  }
470  
471  /*
472   * Wait for pending requests to complete on a single BlockDriverState subtree,
473   * and suspend the block driver's internal I/O until the next request arrives.
474   *
475   * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
476   * AioContext.
477   */
478  void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
479  {
480      assert(qemu_in_coroutine());
481      bdrv_drained_begin(bs);
482      bdrv_drained_end(bs);
483  }
484  
485  void bdrv_drain(BlockDriverState *bs)
486  {
487      bdrv_drained_begin(bs);
488      bdrv_drained_end(bs);
489  }
490  
491  static void bdrv_drain_assert_idle(BlockDriverState *bs)
492  {
493      BdrvChild *child, *next;
494  
495      assert(atomic_read(&bs->in_flight) == 0);
496      QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
497          bdrv_drain_assert_idle(child->bs);
498      }
499  }
500  
501  unsigned int bdrv_drain_all_count = 0;
502  
503  static bool bdrv_drain_all_poll(void)
504  {
505      BlockDriverState *bs = NULL;
506      bool result = false;
507  
508      /* bdrv_drain_poll() can't make changes to the graph and we are holding the
509       * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
510      while ((bs = bdrv_next_all_states(bs))) {
511          AioContext *aio_context = bdrv_get_aio_context(bs);
512          aio_context_acquire(aio_context);
513          result |= bdrv_drain_poll(bs, false, NULL, true);
514          aio_context_release(aio_context);
515      }
516  
517      return result;
518  }
519  
520  /*
521   * Wait for pending requests to complete across all BlockDriverStates
522   *
523   * This function does not flush data to disk; use bdrv_flush_all() for that
524   * after calling this function.
525   *
526   * This pauses all block jobs and disables external clients. It must
527   * be paired with bdrv_drain_all_end().
528   *
529   * NOTE: no new block jobs or BlockDriverStates can be created between
530   * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
531   */
532  void bdrv_drain_all_begin(void)
533  {
534      BlockDriverState *bs = NULL;
535  
536      if (qemu_in_coroutine()) {
537          bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true);
538          return;
539      }
540  
541      /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
542       * loop AioContext, so make sure we're in the main context. */
543      assert(qemu_get_current_aio_context() == qemu_get_aio_context());
544      assert(bdrv_drain_all_count < INT_MAX);
545      bdrv_drain_all_count++;
546  
547      /* Quiesce all nodes, without polling in-flight requests yet. The graph
548       * cannot change during this loop. */
549      while ((bs = bdrv_next_all_states(bs))) {
550          AioContext *aio_context = bdrv_get_aio_context(bs);
551  
552          aio_context_acquire(aio_context);
553          bdrv_do_drained_begin(bs, false, NULL, true, false);
554          aio_context_release(aio_context);
555      }
556  
557      /* Now poll the in-flight requests */
558      AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
559  
560      while ((bs = bdrv_next_all_states(bs))) {
561          bdrv_drain_assert_idle(bs);
562      }
563  }
564  
565  void bdrv_drain_all_end(void)
566  {
567      BlockDriverState *bs = NULL;
568  
569      while ((bs = bdrv_next_all_states(bs))) {
570          AioContext *aio_context = bdrv_get_aio_context(bs);
571  
572          aio_context_acquire(aio_context);
573          bdrv_do_drained_end(bs, false, NULL, true);
574          aio_context_release(aio_context);
575      }
576  
577      assert(bdrv_drain_all_count > 0);
578      bdrv_drain_all_count--;
579  }
580  
581  void bdrv_drain_all(void)
582  {
583      bdrv_drain_all_begin();
584      bdrv_drain_all_end();
585  }
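/*
 * Illustrative pairing (a sketch): code that needs the whole block layer
 * quiescent, and optionally stable on disk, would typically do
 *
 *     bdrv_drain_all_begin();
 *     bdrv_flush_all();        ... only if data must also reach the disk ...
 *     ... perform the global state change ...
 *     bdrv_drain_all_end();
 *
 * as suggested by the comment above bdrv_drain_all_begin().
 */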
586  
587  /**
588   * Remove an active request from the tracked requests list
589   *
590   * This function should be called when a tracked request is completing.
591   */
592  static void tracked_request_end(BdrvTrackedRequest *req)
593  {
594      if (req->serialising) {
595          atomic_dec(&req->bs->serialising_in_flight);
596      }
597  
598      qemu_co_mutex_lock(&req->bs->reqs_lock);
599      QLIST_REMOVE(req, list);
600      qemu_co_queue_restart_all(&req->wait_queue);
601      qemu_co_mutex_unlock(&req->bs->reqs_lock);
602  }
603  
604  /**
605   * Add an active request to the tracked requests list
606   */
607  static void tracked_request_begin(BdrvTrackedRequest *req,
608                                    BlockDriverState *bs,
609                                    int64_t offset,
610                                    uint64_t bytes,
611                                    enum BdrvTrackedRequestType type)
612  {
613      assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);
614  
615      *req = (BdrvTrackedRequest){
616          .bs = bs,
617          .offset         = offset,
618          .bytes          = bytes,
619          .type           = type,
620          .co             = qemu_coroutine_self(),
621          .serialising    = false,
622          .overlap_offset = offset,
623          .overlap_bytes  = bytes,
624      };
625  
626      qemu_co_queue_init(&req->wait_queue);
627  
628      qemu_co_mutex_lock(&bs->reqs_lock);
629      QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
630      qemu_co_mutex_unlock(&bs->reqs_lock);
631  }
632  
633  static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
634  {
635      int64_t overlap_offset = req->offset & ~(align - 1);
636      uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
637                                 - overlap_offset;
638  
639      if (!req->serialising) {
640          atomic_inc(&req->bs->serialising_in_flight);
641          req->serialising = true;
642      }
643  
644      req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
645      req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
646  }
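/*
 * Worked example of the rounding above: with align = 4096, a request of
 * 1024 bytes at offset 1536 gets overlap_offset = 0 and overlap_bytes =
 * ROUND_UP(2560, 4096) - 0 = 4096, i.e. the whole surrounding 4 KiB block
 * counts as overlapping for serialisation purposes.
 */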
647  
648  static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
649  {
650      /*
651       * If the request is serialising, overlap_offset and overlap_bytes are set,
652       * so we can check if the request is aligned. Otherwise, we don't care;
653       * just return false.
654       */
655  
656      return req->serialising && (req->offset == req->overlap_offset) &&
657             (req->bytes == req->overlap_bytes);
658  }
659  
660  /**
661   * Round a region to cluster boundaries
662   */
663  void bdrv_round_to_clusters(BlockDriverState *bs,
664                              int64_t offset, int64_t bytes,
665                              int64_t *cluster_offset,
666                              int64_t *cluster_bytes)
667  {
668      BlockDriverInfo bdi;
669  
670      if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
671          *cluster_offset = offset;
672          *cluster_bytes = bytes;
673      } else {
674          int64_t c = bdi.cluster_size;
675          *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
676          *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
677      }
678  }
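/*
 * Worked example: with bdi.cluster_size = 4096, offset = 5000 and
 * bytes = 2000 are rounded to *cluster_offset = 4096 and *cluster_bytes =
 * QEMU_ALIGN_UP(5000 - 4096 + 2000, 4096) = 4096.
 */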
679  
680  static int bdrv_get_cluster_size(BlockDriverState *bs)
681  {
682      BlockDriverInfo bdi;
683      int ret;
684  
685      ret = bdrv_get_info(bs, &bdi);
686      if (ret < 0 || bdi.cluster_size == 0) {
687          return bs->bl.request_alignment;
688      } else {
689          return bdi.cluster_size;
690      }
691  }
692  
693  static bool tracked_request_overlaps(BdrvTrackedRequest *req,
694                                       int64_t offset, uint64_t bytes)
695  {
696      /*        aaaa   bbbb */
697      if (offset >= req->overlap_offset + req->overlap_bytes) {
698          return false;
699      }
700      /* bbbb   aaaa        */
701      if (req->overlap_offset >= offset + bytes) {
702          return false;
703      }
704      return true;
705  }
706  
707  void bdrv_inc_in_flight(BlockDriverState *bs)
708  {
709      atomic_inc(&bs->in_flight);
710  }
711  
712  void bdrv_wakeup(BlockDriverState *bs)
713  {
714      aio_wait_kick();
715  }
716  
717  void bdrv_dec_in_flight(BlockDriverState *bs)
718  {
719      atomic_dec(&bs->in_flight);
720      bdrv_wakeup(bs);
721  }
722  
723  static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
724  {
725      BlockDriverState *bs = self->bs;
726      BdrvTrackedRequest *req;
727      bool retry;
728      bool waited = false;
729  
730      if (!atomic_read(&bs->serialising_in_flight)) {
731          return false;
732      }
733  
734      do {
735          retry = false;
736          qemu_co_mutex_lock(&bs->reqs_lock);
737          QLIST_FOREACH(req, &bs->tracked_requests, list) {
738              if (req == self || (!req->serialising && !self->serialising)) {
739                  continue;
740              }
741              if (tracked_request_overlaps(req, self->overlap_offset,
742                                           self->overlap_bytes))
743              {
744                  /* Hitting this means there was a reentrant request, for
745                   * example, a block driver issuing nested requests.  This must
746                   * never happen since it means deadlock.
747                   */
748                  assert(qemu_coroutine_self() != req->co);
749  
750                  /* If the request is already (indirectly) waiting for us, or
751                   * will wait for us as soon as it wakes up, then just go on
752                   * (instead of producing a deadlock in the former case). */
753                  if (!req->waiting_for) {
754                      self->waiting_for = req;
755                      qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
756                      self->waiting_for = NULL;
757                      retry = true;
758                      waited = true;
759                      break;
760                  }
761              }
762          }
763          qemu_co_mutex_unlock(&bs->reqs_lock);
764      } while (retry);
765  
766      return waited;
767  }
768  
769  static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
770                                     size_t size)
771  {
772      if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
773          return -EIO;
774      }
775  
776      if (!bdrv_is_inserted(bs)) {
777          return -ENOMEDIUM;
778      }
779  
780      if (offset < 0) {
781          return -EIO;
782      }
783  
784      return 0;
785  }
786  
787  typedef struct RwCo {
788      BdrvChild *child;
789      int64_t offset;
790      QEMUIOVector *qiov;
791      bool is_write;
792      int ret;
793      BdrvRequestFlags flags;
794  } RwCo;
795  
796  static void coroutine_fn bdrv_rw_co_entry(void *opaque)
797  {
798      RwCo *rwco = opaque;
799  
800      if (!rwco->is_write) {
801          rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
802                                     rwco->qiov->size, rwco->qiov,
803                                     rwco->flags);
804      } else {
805          rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
806                                      rwco->qiov->size, rwco->qiov,
807                                      rwco->flags);
808      }
809      aio_wait_kick();
810  }
811  
812  /*
813   * Process a vectored synchronous request using coroutines
814   */
815  static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
816                          QEMUIOVector *qiov, bool is_write,
817                          BdrvRequestFlags flags)
818  {
819      Coroutine *co;
820      RwCo rwco = {
821          .child = child,
822          .offset = offset,
823          .qiov = qiov,
824          .is_write = is_write,
825          .ret = NOT_DONE,
826          .flags = flags,
827      };
828  
829      if (qemu_in_coroutine()) {
830          /* Fast-path if already in coroutine context */
831          bdrv_rw_co_entry(&rwco);
832      } else {
833          co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
834          bdrv_coroutine_enter(child->bs, co);
835          BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
836      }
837      return rwco.ret;
838  }
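/*
 * The pattern above is the usual way synchronous wrappers drive a
 * coroutine_fn: inside a coroutine the request runs directly, while outside
 * one a coroutine is created and BDRV_POLL_WHILE() runs the event loop until
 * rwco.ret is changed from its NOT_DONE sentinel.
 */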
839  
840  /*
841   * Process a synchronous request using coroutines
842   */
843  static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
844                        int nb_sectors, bool is_write, BdrvRequestFlags flags)
845  {
846      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf,
847                                              nb_sectors * BDRV_SECTOR_SIZE);
848  
849      if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
850          return -EINVAL;
851      }
852  
853      return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
854                          &qiov, is_write, flags);
855  }
856  
857  /* return < 0 if error. See bdrv_write() for the return codes */
858  int bdrv_read(BdrvChild *child, int64_t sector_num,
859                uint8_t *buf, int nb_sectors)
860  {
861      return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
862  }
863  
864  /* Return < 0 if error. Important errors are:
865    -EIO         generic I/O error (may happen for all errors)
866    -ENOMEDIUM   No media inserted.
867    -EINVAL      Invalid sector number or nb_sectors
868    -EACCES      Trying to write a read-only device
869  */
870  int bdrv_write(BdrvChild *child, int64_t sector_num,
871                 const uint8_t *buf, int nb_sectors)
872  {
873      return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
874  }
875  
876  int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
877                         int bytes, BdrvRequestFlags flags)
878  {
879      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
880  
881      return bdrv_prwv_co(child, offset, &qiov, true,
882                          BDRV_REQ_ZERO_WRITE | flags);
883  }
884  
885  /*
886   * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
887   * The operation is sped up by checking the block status and only writing
888   * zeroes to regions that do not already read as zeroes. Optional
889   * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
890   * BDRV_REQ_FUA).
891   *
892   * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
893   */
894  int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
895  {
896      int ret;
897      int64_t target_size, bytes, offset = 0;
898      BlockDriverState *bs = child->bs;
899  
900      target_size = bdrv_getlength(bs);
901      if (target_size < 0) {
902          return target_size;
903      }
904  
905      for (;;) {
906          bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
907          if (bytes <= 0) {
908              return 0;
909          }
910          ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
911          if (ret < 0) {
912              return ret;
913          }
914          if (ret & BDRV_BLOCK_ZERO) {
915              offset += bytes;
916              continue;
917          }
918          ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
919          if (ret < 0) {
920              return ret;
921          }
922          offset += bytes;
923      }
924  }
925  
926  int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
927  {
928      int ret;
929  
930      ret = bdrv_prwv_co(child, offset, qiov, false, 0);
931      if (ret < 0) {
932          return ret;
933      }
934  
935      return qiov->size;
936  }
937  
938  int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
939  {
940      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
941  
942      if (bytes < 0) {
943          return -EINVAL;
944      }
945  
946      return bdrv_preadv(child, offset, &qiov);
947  }
948  
949  int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
950  {
951      int ret;
952  
953      ret = bdrv_prwv_co(child, offset, qiov, true, 0);
954      if (ret < 0) {
955          return ret;
956      }
957  
958      return qiov->size;
959  }
960  
961  int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
962  {
963      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
964  
965      if (bytes < 0) {
966          return -EINVAL;
967      }
968  
969      return bdrv_pwritev(child, offset, &qiov);
970  }
971  
972  /*
973   * Writes to the file and ensures that no writes are reordered across this
974   * request (acts as a barrier)
975   *
976   * Returns 0 on success, -errno in error cases.
977   */
978  int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
979                       const void *buf, int count)
980  {
981      int ret;
982  
983      ret = bdrv_pwrite(child, offset, buf, count);
984      if (ret < 0) {
985          return ret;
986      }
987  
988      ret = bdrv_flush(child->bs);
989      if (ret < 0) {
990          return ret;
991      }
992  
993      return 0;
994  }
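/*
 * Illustrative use (a sketch; the names are placeholders, not from a real
 * caller): format drivers rely on this write-then-flush behaviour when a
 * metadata update must be stable on disk before a dependent update follows:
 *
 *     ret = bdrv_pwrite_sync(bs->file, table_offset, &entry, sizeof(entry));
 */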
995  
996  typedef struct CoroutineIOCompletion {
997      Coroutine *coroutine;
998      int ret;
999  } CoroutineIOCompletion;
1000  
1001  static void bdrv_co_io_em_complete(void *opaque, int ret)
1002  {
1003      CoroutineIOCompletion *co = opaque;
1004  
1005      co->ret = ret;
1006      aio_co_wake(co->coroutine);
1007  }
1008  
1009  static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
1010                                             uint64_t offset, uint64_t bytes,
1011                                             QEMUIOVector *qiov, int flags)
1012  {
1013      BlockDriver *drv = bs->drv;
1014      int64_t sector_num;
1015      unsigned int nb_sectors;
1016  
1017      assert(!(flags & ~BDRV_REQ_MASK));
1018      assert(!(flags & BDRV_REQ_NO_FALLBACK));
1019  
1020      if (!drv) {
1021          return -ENOMEDIUM;
1022      }
1023  
1024      if (drv->bdrv_co_preadv) {
1025          return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
1026      }
1027  
1028      if (drv->bdrv_aio_preadv) {
1029          BlockAIOCB *acb;
1030          CoroutineIOCompletion co = {
1031              .coroutine = qemu_coroutine_self(),
1032          };
1033  
1034          acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
1035                                     bdrv_co_io_em_complete, &co);
1036          if (acb == NULL) {
1037              return -EIO;
1038          } else {
1039              qemu_coroutine_yield();
1040              return co.ret;
1041          }
1042      }
1043  
1044      sector_num = offset >> BDRV_SECTOR_BITS;
1045      nb_sectors = bytes >> BDRV_SECTOR_BITS;
1046  
1047      assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1048      assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1049      assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
1050      assert(drv->bdrv_co_readv);
1051  
1052      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1053  }
1054  
1055  static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
1056                                              uint64_t offset, uint64_t bytes,
1057                                              QEMUIOVector *qiov, int flags)
1058  {
1059      BlockDriver *drv = bs->drv;
1060      int64_t sector_num;
1061      unsigned int nb_sectors;
1062      int ret;
1063  
1064      assert(!(flags & ~BDRV_REQ_MASK));
1065      assert(!(flags & BDRV_REQ_NO_FALLBACK));
1066  
1067      if (!drv) {
1068          return -ENOMEDIUM;
1069      }
1070  
1071      if (drv->bdrv_co_pwritev) {
1072          ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
1073                                     flags & bs->supported_write_flags);
1074          flags &= ~bs->supported_write_flags;
1075          goto emulate_flags;
1076      }
1077  
1078      if (drv->bdrv_aio_pwritev) {
1079          BlockAIOCB *acb;
1080          CoroutineIOCompletion co = {
1081              .coroutine = qemu_coroutine_self(),
1082          };
1083  
1084          acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
1085                                      flags & bs->supported_write_flags,
1086                                      bdrv_co_io_em_complete, &co);
1087          flags &= ~bs->supported_write_flags;
1088          if (acb == NULL) {
1089              ret = -EIO;
1090          } else {
1091              qemu_coroutine_yield();
1092              ret = co.ret;
1093          }
1094          goto emulate_flags;
1095      }
1096  
1097      sector_num = offset >> BDRV_SECTOR_BITS;
1098      nb_sectors = bytes >> BDRV_SECTOR_BITS;
1099  
1100      assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
1101      assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
1102      assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
1103  
1104      assert(drv->bdrv_co_writev);
1105      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
1106                                flags & bs->supported_write_flags);
1107      flags &= ~bs->supported_write_flags;
1108  
1109  emulate_flags:
1110      if (ret == 0 && (flags & BDRV_REQ_FUA)) {
1111          ret = bdrv_co_flush(bs);
1112      }
1113  
1114      return ret;
1115  }
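/*
 * Example of the flag emulation above: if the driver does not advertise
 * BDRV_REQ_FUA in bs->supported_write_flags, the flag is stripped before the
 * driver callback is invoked and the write is followed by an explicit
 * bdrv_co_flush(), which gives the same persistence guarantee at the cost of
 * flushing the whole disk cache.
 */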
1116  
1117  static int coroutine_fn
1118  bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1119                                 uint64_t bytes, QEMUIOVector *qiov)
1120  {
1121      BlockDriver *drv = bs->drv;
1122  
1123      if (!drv) {
1124          return -ENOMEDIUM;
1125      }
1126  
1127      if (!drv->bdrv_co_pwritev_compressed) {
1128          return -ENOTSUP;
1129      }
1130  
1131      return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
1132  }
1133  
1134  static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
1135          int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
1136  {
1137      BlockDriverState *bs = child->bs;
1138  
1139      /* Perform I/O through a temporary buffer so that users who scribble over
1140       * their read buffer while the operation is in progress do not end up
1141       * modifying the image file.  This is critical for zero-copy guest I/O
1142       * where anything might happen inside guest memory.
1143       */
1144      void *bounce_buffer;
1145  
1146      BlockDriver *drv = bs->drv;
1147      QEMUIOVector local_qiov;
1148      int64_t cluster_offset;
1149      int64_t cluster_bytes;
1150      size_t skip_bytes;
1151      int ret;
1152      int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1153                                      BDRV_REQUEST_MAX_BYTES);
1154      unsigned int progress = 0;
1155  
1156      if (!drv) {
1157          return -ENOMEDIUM;
1158      }
1159  
1160      /* FIXME We cannot require callers to have write permissions when all they
1161       * are doing is a read request. If we did things right, write permissions
1162       * would be obtained anyway, but internally by the copy-on-read code. As
1163       * long as it is implemented here rather than in a separate filter driver,
1164       * the copy-on-read code doesn't have its own BdrvChild, however, for which
1165       * it could request permissions. Therefore we have to bypass the permission
1166       * system for the moment. */
1167      // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1168  
1169      /* Cover the entire cluster so no additional backing file I/O is required
1170       * when allocating a cluster in the image file.  Note that this value may exceed
1171       * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1172       * is one reason we loop rather than doing it all at once.
1173       */
1174      bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1175      skip_bytes = offset - cluster_offset;
1176  
1177      trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1178                                     cluster_offset, cluster_bytes);
1179  
1180      bounce_buffer = qemu_try_blockalign(bs,
1181                                          MIN(MIN(max_transfer, cluster_bytes),
1182                                              MAX_BOUNCE_BUFFER));
1183      if (bounce_buffer == NULL) {
1184          ret = -ENOMEM;
1185          goto err;
1186      }
1187  
1188      while (cluster_bytes) {
1189          int64_t pnum;
1190  
1191          ret = bdrv_is_allocated(bs, cluster_offset,
1192                                  MIN(cluster_bytes, max_transfer), &pnum);
1193          if (ret < 0) {
1194              /* Safe to treat errors in querying allocation as if
1195               * unallocated; we'll probably fail again soon on the
1196               * read, but at least that will set a decent errno.
1197               */
1198              pnum = MIN(cluster_bytes, max_transfer);
1199          }
1200  
1201          /* Stop at EOF if the image ends in the middle of the cluster */
1202          if (ret == 0 && pnum == 0) {
1203              assert(progress >= bytes);
1204              break;
1205          }
1206  
1207          assert(skip_bytes < pnum);
1208  
1209          if (ret <= 0) {
1210              /* Must copy-on-read; use the bounce buffer */
1211              pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1212              qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
1213  
1214              ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1215                                       &local_qiov, 0);
1216              if (ret < 0) {
1217                  goto err;
1218              }
1219  
1220              bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1221              if (drv->bdrv_co_pwrite_zeroes &&
1222                  buffer_is_zero(bounce_buffer, pnum)) {
1223                  /* FIXME: Should we (perhaps conditionally) be setting
1224                   * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1225                   * that still correctly reads as zero? */
1226                  ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
1227                                                 BDRV_REQ_WRITE_UNCHANGED);
1228              } else {
1229                  /* This does not change the data on the disk, it is not
1230                   * necessary to flush even in cache=writethrough mode.
1231                   */
1232                  ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1233                                            &local_qiov,
1234                                            BDRV_REQ_WRITE_UNCHANGED);
1235              }
1236  
1237              if (ret < 0) {
1238                  /* It might be okay to ignore write errors for guest
1239                   * requests.  If this is a deliberate copy-on-read
1240                   * then we don't want to ignore the error.  Simply
1241                   * report it in all cases.
1242                   */
1243                  goto err;
1244              }
1245  
1246              qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
1247                                  pnum - skip_bytes);
1248          } else {
1249              /* Read directly into the destination */
1250              qemu_iovec_init(&local_qiov, qiov->niov);
1251              qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
1252              ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
1253                                       &local_qiov, 0);
1254              qemu_iovec_destroy(&local_qiov);
1255              if (ret < 0) {
1256                  goto err;
1257              }
1258          }
1259  
1260          cluster_offset += pnum;
1261          cluster_bytes -= pnum;
1262          progress += pnum - skip_bytes;
1263          skip_bytes = 0;
1264      }
1265      ret = 0;
1266  
1267  err:
1268      qemu_vfree(bounce_buffer);
1269      return ret;
1270  }
1271  
1272  /*
1273   * Forwards an already correctly aligned request to the BlockDriver. This
1274   * handles copy on read, zeroing after EOF, and fragmentation of large
1275   * reads; any other features must be implemented by the caller.
1276   */
1277  static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
1278      BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1279      int64_t align, QEMUIOVector *qiov, int flags)
1280  {
1281      BlockDriverState *bs = child->bs;
1282      int64_t total_bytes, max_bytes;
1283      int ret = 0;
1284      uint64_t bytes_remaining = bytes;
1285      int max_transfer;
1286  
1287      assert(is_power_of_2(align));
1288      assert((offset & (align - 1)) == 0);
1289      assert((bytes & (align - 1)) == 0);
1290      assert(!qiov || bytes == qiov->size);
1291      assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1292      max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1293                                     align);
1294  
1295      /* TODO: We would need a per-BDS .supported_read_flags and
1296       * potential fallback support, if we ever implement any read flags
1297       * to pass through to drivers.  For now, there aren't any
1298       * passthrough flags.  */
1299      assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
1300  
1301      /* Handle Copy on Read and associated serialisation */
1302      if (flags & BDRV_REQ_COPY_ON_READ) {
1303          /* If we touch the same cluster it counts as an overlap.  This
1304           * guarantees that allocating writes will be serialized and not race
1305           * with each other for the same cluster.  For example, in copy-on-read
1306           * it ensures that the CoR read and write operations are atomic and
1307           * guest writes cannot interleave between them. */
1308          mark_request_serialising(req, bdrv_get_cluster_size(bs));
1309      }
1310  
1311      /* BDRV_REQ_SERIALISING is only for write operations */
1312      assert(!(flags & BDRV_REQ_SERIALISING));
1313  
1314      if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1315          wait_serialising_requests(req);
1316      }
1317  
1318      if (flags & BDRV_REQ_COPY_ON_READ) {
1319          int64_t pnum;
1320  
1321          ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
1322          if (ret < 0) {
1323              goto out;
1324          }
1325  
1326          if (!ret || pnum != bytes) {
1327              ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
1328              goto out;
1329          }
1330      }
1331  
1332      /* Forward the request to the BlockDriver, possibly fragmenting it */
1333      total_bytes = bdrv_getlength(bs);
1334      if (total_bytes < 0) {
1335          ret = total_bytes;
1336          goto out;
1337      }
1338  
1339      max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1340      if (bytes <= max_bytes && bytes <= max_transfer) {
1341          ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1342          goto out;
1343      }
1344  
1345      while (bytes_remaining) {
1346          int num;
1347  
1348          if (max_bytes) {
1349              QEMUIOVector local_qiov;
1350  
1351              num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1352              assert(num);
1353              qemu_iovec_init(&local_qiov, qiov->niov);
1354              qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1355  
1356              ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1357                                       num, &local_qiov, 0);
1358              max_bytes -= num;
1359              qemu_iovec_destroy(&local_qiov);
1360          } else {
1361              num = bytes_remaining;
1362              ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1363                                      bytes_remaining);
1364          }
1365          if (ret < 0) {
1366              goto out;
1367          }
1368          bytes_remaining -= num;
1369      }
1370  
1371  out:
1372      return ret < 0 ? ret : 0;
1373  }
1374  
1375  /*
1376   * Handle a read request in coroutine context
1377   */
1378  int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1379      int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1380      BdrvRequestFlags flags)
1381  {
1382      BlockDriverState *bs = child->bs;
1383      BlockDriver *drv = bs->drv;
1384      BdrvTrackedRequest req;
1385  
1386      uint64_t align = bs->bl.request_alignment;
1387      uint8_t *head_buf = NULL;
1388      uint8_t *tail_buf = NULL;
1389      QEMUIOVector local_qiov;
1390      bool use_local_qiov = false;
1391      int ret;
1392  
1393      trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1394  
1395      if (!drv) {
1396          return -ENOMEDIUM;
1397      }
1398  
1399      ret = bdrv_check_byte_request(bs, offset, bytes);
1400      if (ret < 0) {
1401          return ret;
1402      }
1403  
1404      bdrv_inc_in_flight(bs);
1405  
1406      /* Don't do copy-on-read if we read data before write operation */
1407      if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
1408          flags |= BDRV_REQ_COPY_ON_READ;
1409      }
1410  
1411      /* Align read if necessary by padding qiov */
1412      if (offset & (align - 1)) {
1413          head_buf = qemu_blockalign(bs, align);
1414          qemu_iovec_init(&local_qiov, qiov->niov + 2);
1415          qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1416          qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1417          use_local_qiov = true;
1418  
1419          bytes += offset & (align - 1);
1420          offset = offset & ~(align - 1);
1421      }
1422  
1423      if ((offset + bytes) & (align - 1)) {
1424          if (!use_local_qiov) {
1425              qemu_iovec_init(&local_qiov, qiov->niov + 1);
1426              qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1427              use_local_qiov = true;
1428          }
1429          tail_buf = qemu_blockalign(bs, align);
1430          qemu_iovec_add(&local_qiov, tail_buf,
1431                         align - ((offset + bytes) & (align - 1)));
1432  
1433          bytes = ROUND_UP(bytes, align);
1434      }
1435  
1436      tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1437      ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
1438                                use_local_qiov ? &local_qiov : qiov,
1439                                flags);
1440      tracked_request_end(&req);
1441      bdrv_dec_in_flight(bs);
1442  
1443      if (use_local_qiov) {
1444          qemu_iovec_destroy(&local_qiov);
1445          qemu_vfree(head_buf);
1446          qemu_vfree(tail_buf);
1447      }
1448  
1449      return ret;
1450  }
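/*
 * Worked example of the padding above: with request_alignment = 512, a read
 * of 100 bytes at offset 1000 grows by a 488-byte head (1000 & 511) and a
 * 436-byte tail, so the driver sees an aligned 1024-byte read at offset 512
 * and only the middle 100 bytes end up in the caller's qiov.
 */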
1451  
1452  static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1453      int64_t offset, int bytes, BdrvRequestFlags flags)
1454  {
1455      BlockDriver *drv = bs->drv;
1456      QEMUIOVector qiov;
1457      void *buf = NULL;
1458      int ret = 0;
1459      bool need_flush = false;
1460      int head = 0;
1461      int tail = 0;
1462  
1463      int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1464      int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1465                          bs->bl.request_alignment);
1466      int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1467  
1468      if (!drv) {
1469          return -ENOMEDIUM;
1470      }
1471  
1472      if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1473          return -ENOTSUP;
1474      }
1475  
1476      assert(alignment % bs->bl.request_alignment == 0);
1477      head = offset % alignment;
1478      tail = (offset + bytes) % alignment;
1479      max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1480      assert(max_write_zeroes >= bs->bl.request_alignment);
1481  
1482      while (bytes > 0 && !ret) {
1483          int num = bytes;
1484  
1485          /* Align request.  Block drivers can expect the "bulk" of the request
1486           * to be aligned, and that unaligned requests do not cross cluster
1487           * boundaries.
1488           */
1489          if (head) {
1490              /* Make a small request up to the first aligned sector. For
1491               * convenience, limit this request to max_transfer even if
1492               * we don't need to fall back to writes.  */
1493              num = MIN(MIN(bytes, max_transfer), alignment - head);
1494              head = (head + num) % alignment;
1495              assert(num < max_write_zeroes);
1496          } else if (tail && num > alignment) {
1497              /* Shorten the request to the last aligned sector.  */
1498              num -= tail;
1499          }
1500  
1501          /* limit request size */
1502          if (num > max_write_zeroes) {
1503              num = max_write_zeroes;
1504          }
1505  
1506          ret = -ENOTSUP;
1507          /* First try the efficient write zeroes operation */
1508          if (drv->bdrv_co_pwrite_zeroes) {
1509              ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1510                                               flags & bs->supported_zero_flags);
1511              if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1512                  !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1513                  need_flush = true;
1514              }
1515          } else {
1516              assert(!bs->supported_zero_flags);
1517          }
1518  
1519          if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
1520              /* Fall back to bounce buffer if write zeroes is unsupported */
1521              BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1522  
1523              if ((flags & BDRV_REQ_FUA) &&
1524                  !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1525                  /* No need for bdrv_driver_pwritev() to do a fallback
1526                   * flush on each chunk; use just one at the end */
1527                  write_flags &= ~BDRV_REQ_FUA;
1528                  need_flush = true;
1529              }
1530              num = MIN(num, max_transfer);
1531              if (buf == NULL) {
1532                  buf = qemu_try_blockalign0(bs, num);
1533                  if (buf == NULL) {
1534                      ret = -ENOMEM;
1535                      goto fail;
1536                  }
1537              }
1538              qemu_iovec_init_buf(&qiov, buf, num);
1539  
1540              ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1541  
1542              /* Keep the bounce buffer around if it is big enough for
1543               * all future requests.
1544               */
1545              if (num < max_transfer) {
1546                  qemu_vfree(buf);
1547                  buf = NULL;
1548              }
1549          }
1550  
1551          offset += num;
1552          bytes -= num;
1553      }
1554  
1555  fail:
1556      if (ret == 0 && need_flush) {
1557          ret = bdrv_co_flush(bs);
1558      }
1559      qemu_vfree(buf);
1560      return ret;
1561  }
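/*
 * Worked example of the head/tail handling above (assuming max_transfer and
 * max_pwrite_zeroes are large enough not to matter): with a 64 KiB
 * pwrite_zeroes_alignment, zeroing 200 KiB starting 4 KiB past a 64 KiB
 * boundary is issued as a 60 KiB head request, a 128 KiB aligned middle
 * request and a 12 KiB tail request.
 */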
1562  
1563  static inline int coroutine_fn
1564  bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
1565                            BdrvTrackedRequest *req, int flags)
1566  {
1567      BlockDriverState *bs = child->bs;
1568      bool waited;
1569      int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1570  
1571      if (bs->read_only) {
1572          return -EPERM;
1573      }
1574  
1575      /* BDRV_REQ_NO_SERIALISING is only for read operation */
1576      assert(!(flags & BDRV_REQ_NO_SERIALISING));
1577      assert(!(bs->open_flags & BDRV_O_INACTIVE));
1578      assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1579      assert(!(flags & ~BDRV_REQ_MASK));
1580  
1581      if (flags & BDRV_REQ_SERIALISING) {
1582          mark_request_serialising(req, bdrv_get_cluster_size(bs));
1583      }
1584  
1585      waited = wait_serialising_requests(req);
1586  
1587      assert(!waited || !req->serialising ||
1588             is_request_serialising_and_aligned(req));
1589      assert(req->overlap_offset <= offset);
1590      assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1591      assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1592  
1593      switch (req->type) {
1594      case BDRV_TRACKED_WRITE:
1595      case BDRV_TRACKED_DISCARD:
1596          if (flags & BDRV_REQ_WRITE_UNCHANGED) {
1597              assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1598          } else {
1599              assert(child->perm & BLK_PERM_WRITE);
1600          }
1601          return notifier_with_return_list_notify(&bs->before_write_notifiers,
1602                                                  req);
1603      case BDRV_TRACKED_TRUNCATE:
1604          assert(child->perm & BLK_PERM_RESIZE);
1605          return 0;
1606      default:
1607          abort();
1608      }
1609  }
1610  
1611  static inline void coroutine_fn
1612  bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
1613                           BdrvTrackedRequest *req, int ret)
1614  {
1615      int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1616      BlockDriverState *bs = child->bs;
1617  
1618      atomic_inc(&bs->write_gen);
1619  
1620      /*
1621       * Discard cannot extend the image, but in error handling cases, such as
1622       * when reverting a qcow2 cluster allocation, the discarded range can pass
1623       * beyond the end of the image file, so we cannot assert about
1624       * BDRV_TRACKED_DISCARD here. Instead, just skip it, since semantically a
1625       * discard request beyond EOF cannot expand the image anyway.
1626       */
1627      if (ret == 0 &&
1628          (req->type == BDRV_TRACKED_TRUNCATE ||
1629           end_sector > bs->total_sectors) &&
1630          req->type != BDRV_TRACKED_DISCARD) {
1631          bs->total_sectors = end_sector;
1632          bdrv_parent_cb_resize(bs);
1633          bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
1634      }
1635      if (req->bytes) {
1636          switch (req->type) {
1637          case BDRV_TRACKED_WRITE:
1638              stat64_max(&bs->wr_highest_offset, offset + bytes);
1639              /* fall through, to set dirty bits */
1640          case BDRV_TRACKED_DISCARD:
1641              bdrv_set_dirty(bs, offset, bytes);
1642              break;
1643          default:
1644              break;
1645          }
1646      }
1647  }
1648  
1649  /*
1650   * Forwards an already correctly aligned write request to the BlockDriver,
1651   * after possibly fragmenting it.
1652   */
1653  static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
1654      BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1655      int64_t align, QEMUIOVector *qiov, int flags)
1656  {
1657      BlockDriverState *bs = child->bs;
1658      BlockDriver *drv = bs->drv;
1659      int ret;
1660  
1661      uint64_t bytes_remaining = bytes;
1662      int max_transfer;
1663  
1664      if (!drv) {
1665          return -ENOMEDIUM;
1666      }
1667  
1668      if (bdrv_has_readonly_bitmaps(bs)) {
1669          return -EPERM;
1670      }
1671  
1672      assert(is_power_of_2(align));
1673      assert((offset & (align - 1)) == 0);
1674      assert((bytes & (align - 1)) == 0);
1675      assert(!qiov || bytes == qiov->size);
1676      max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1677                                     align);
1678  
1679      ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
1680  
1681      if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1682          !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1683          qemu_iovec_is_zero(qiov)) {
1684          flags |= BDRV_REQ_ZERO_WRITE;
1685          if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1686              flags |= BDRV_REQ_MAY_UNMAP;
1687          }
1688      }
1689  
1690      if (ret < 0) {
1691          /* Do nothing, write notifier decided to fail this request */
1692      } else if (flags & BDRV_REQ_ZERO_WRITE) {
1693          bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1694          ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1695      } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1696          ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
1697      } else if (bytes <= max_transfer) {
1698          bdrv_debug_event(bs, BLKDBG_PWRITEV);
1699          ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1700      } else {
1701          bdrv_debug_event(bs, BLKDBG_PWRITEV);
1702          while (bytes_remaining) {
1703              int num = MIN(bytes_remaining, max_transfer);
1704              QEMUIOVector local_qiov;
1705              int local_flags = flags;
1706  
1707              assert(num);
1708              if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1709                  !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1710                  /* If FUA is going to be emulated by flush, we only
1711                   * need to flush on the last iteration */
1712                  local_flags &= ~BDRV_REQ_FUA;
1713              }
1714              qemu_iovec_init(&local_qiov, qiov->niov);
1715              qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1716  
1717              ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1718                                        num, &local_qiov, local_flags);
1719              qemu_iovec_destroy(&local_qiov);
1720              if (ret < 0) {
1721                  break;
1722              }
1723              bytes_remaining -= num;
1724          }
1725      }
1726      bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1727  
1728      if (ret >= 0) {
1729          ret = 0;
1730      }
1731      bdrv_co_write_req_finish(child, offset, bytes, req, ret);
1732  
1733      return ret;
1734  }
1735  
1736  static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
1737                                                  int64_t offset,
1738                                                  unsigned int bytes,
1739                                                  BdrvRequestFlags flags,
1740                                                  BdrvTrackedRequest *req)
1741  {
1742      BlockDriverState *bs = child->bs;
1743      uint8_t *buf = NULL;
1744      QEMUIOVector local_qiov;
1745      uint64_t align = bs->bl.request_alignment;
1746      unsigned int head_padding_bytes, tail_padding_bytes;
1747      int ret = 0;
1748  
1749      head_padding_bytes = offset & (align - 1);
1750      tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
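    /*
     * Worked example (purely illustrative numbers): with align = 4096, a zero
     * write of 20000 bytes at offset 6000 has head_padding_bytes = 1904 and
     * tail_padding_bytes = 2672.  The partially covered block at 4096 and the
     * one at 24576 are handled below with read-modify-write, while the 16384
     * aligned bytes in between (8192..24576) take the fast zero path.
     */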
1751  
1752  
1753      assert(flags & BDRV_REQ_ZERO_WRITE);
1754      if (head_padding_bytes || tail_padding_bytes) {
1755          buf = qemu_blockalign(bs, align);
1756          qemu_iovec_init_buf(&local_qiov, buf, align);
1757      }
1758      if (head_padding_bytes) {
1759          uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1760  
1761          /* RMW the unaligned part before head. */
1762          mark_request_serialising(req, align);
1763          wait_serialising_requests(req);
1764          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1765          ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
1766                                    align, &local_qiov, 0);
1767          if (ret < 0) {
1768              goto fail;
1769          }
1770          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1771  
1772          memset(buf + head_padding_bytes, 0, zero_bytes);
1773          ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1774                                     align, &local_qiov,
1775                                     flags & ~BDRV_REQ_ZERO_WRITE);
1776          if (ret < 0) {
1777              goto fail;
1778          }
1779          offset += zero_bytes;
1780          bytes -= zero_bytes;
1781      }
1782  
1783      assert(!bytes || (offset & (align - 1)) == 0);
1784      if (bytes >= align) {
1785          /* Write the aligned part in the middle. */
1786          uint64_t aligned_bytes = bytes & ~(align - 1);
1787          ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
1788                                     NULL, flags);
1789          if (ret < 0) {
1790              goto fail;
1791          }
1792          bytes -= aligned_bytes;
1793          offset += aligned_bytes;
1794      }
1795  
1796      assert(!bytes || (offset & (align - 1)) == 0);
1797      if (bytes) {
1798          assert(align == tail_padding_bytes + bytes);
1799          /* RMW the unaligned part after tail. */
1800          mark_request_serialising(req, align);
1801          wait_serialising_requests(req);
1802          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1803          ret = bdrv_aligned_preadv(child, req, offset, align,
1804                                    align, &local_qiov, 0);
1805          if (ret < 0) {
1806              goto fail;
1807          }
1808          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1809  
1810          memset(buf, 0, bytes);
1811          ret = bdrv_aligned_pwritev(child, req, offset, align, align,
1812                                     &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1813      }
1814  fail:
1815      qemu_vfree(buf);
1816      return ret;
1817  
1818  }
1819  
1820  /*
1821   * Handle a write request in coroutine context
1822   */
1823  int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1824      int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1825      BdrvRequestFlags flags)
1826  {
1827      BlockDriverState *bs = child->bs;
1828      BdrvTrackedRequest req;
1829      uint64_t align = bs->bl.request_alignment;
1830      uint8_t *head_buf = NULL;
1831      uint8_t *tail_buf = NULL;
1832      QEMUIOVector local_qiov;
1833      bool use_local_qiov = false;
1834      int ret;
1835  
1836      trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
1837  
1838      if (!bs->drv) {
1839          return -ENOMEDIUM;
1840      }
1841  
1842      ret = bdrv_check_byte_request(bs, offset, bytes);
1843      if (ret < 0) {
1844          return ret;
1845      }
1846  
1847      bdrv_inc_in_flight(bs);
1848      /*
1849       * Align write if necessary by performing a read-modify-write cycle.
1850       * Pad qiov with the read parts and be sure to have a tracked request not
1851       * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1852       */
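    /*
     * For illustration (hypothetical numbers): with request_alignment = 512,
     * a write of 1000 bytes at offset 700 first reads the 512-byte block at
     * 512 to build the 188-byte head, then reads the block at 1536 to build
     * the 348-byte tail, and finally issues one aligned write of 1536 bytes
     * at offset 512 (188 + 1000 + 348).
     */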
1853      tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1854  
1855      if (flags & BDRV_REQ_ZERO_WRITE) {
1856          ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
1857          goto out;
1858      }
1859  
1860      if (offset & (align - 1)) {
1861          QEMUIOVector head_qiov;
1862  
1863          mark_request_serialising(&req, align);
1864          wait_serialising_requests(&req);
1865  
1866          head_buf = qemu_blockalign(bs, align);
1867          qemu_iovec_init_buf(&head_qiov, head_buf, align);
1868  
1869          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1870          ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
1871                                    align, &head_qiov, 0);
1872          if (ret < 0) {
1873              goto fail;
1874          }
1875          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1876  
1877          qemu_iovec_init(&local_qiov, qiov->niov + 2);
1878          qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1879          qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1880          use_local_qiov = true;
1881  
1882          bytes += offset & (align - 1);
1883          offset = offset & ~(align - 1);
1884  
1885          /* We have read the tail already if the request is smaller
1886           * than one aligned block.
1887           */
1888          if (bytes < align) {
1889              qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1890              bytes = align;
1891          }
1892      }
1893  
1894      if ((offset + bytes) & (align - 1)) {
1895          QEMUIOVector tail_qiov;
1896          size_t tail_bytes;
1897          bool waited;
1898  
1899          mark_request_serialising(&req, align);
1900          waited = wait_serialising_requests(&req);
1901          assert(!waited || !use_local_qiov);
1902  
1903          tail_buf = qemu_blockalign(bs, align);
1904          qemu_iovec_init_buf(&tail_qiov, tail_buf, align);
1905  
1906          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1907          ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
1908                                    align, align, &tail_qiov, 0);
1909          if (ret < 0) {
1910              goto fail;
1911          }
1912          bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1913  
1914          if (!use_local_qiov) {
1915              qemu_iovec_init(&local_qiov, qiov->niov + 1);
1916              qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1917              use_local_qiov = true;
1918          }
1919  
1920          tail_bytes = (offset + bytes) & (align - 1);
1921          qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1922  
1923          bytes = ROUND_UP(bytes, align);
1924      }
1925  
1926      ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
1927                                 use_local_qiov ? &local_qiov : qiov,
1928                                 flags);
1929  
1930  fail:
1931  
1932      if (use_local_qiov) {
1933          qemu_iovec_destroy(&local_qiov);
1934      }
1935      qemu_vfree(head_buf);
1936      qemu_vfree(tail_buf);
1937  out:
1938      tracked_request_end(&req);
1939      bdrv_dec_in_flight(bs);
1940      return ret;
1941  }
1942  
1943  int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1944                                         int bytes, BdrvRequestFlags flags)
1945  {
1946      trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
1947  
1948      if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1949          flags &= ~BDRV_REQ_MAY_UNMAP;
1950      }
1951  
1952      return bdrv_co_pwritev(child, offset, bytes, NULL,
1953                             BDRV_REQ_ZERO_WRITE | flags);
1954  }
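
/*
 * Illustrative use (hypothetical caller, coroutine context): zero a region
 * and let the driver unmap it where possible.  If the node was opened
 * without BDRV_O_UNMAP, the MAY_UNMAP hint is silently dropped above and the
 * range is only guaranteed to read back as zeroes:
 *
 *     ret = bdrv_co_pwrite_zeroes(child, offset, bytes, BDRV_REQ_MAY_UNMAP);
 */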
1955  
1956  /*
1957   * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
1958   */
1959  int bdrv_flush_all(void)
1960  {
1961      BdrvNextIterator it;
1962      BlockDriverState *bs = NULL;
1963      int result = 0;
1964  
1965      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1966          AioContext *aio_context = bdrv_get_aio_context(bs);
1967          int ret;
1968  
1969          aio_context_acquire(aio_context);
1970          ret = bdrv_flush(bs);
1971          if (ret < 0 && !result) {
1972              result = ret;
1973          }
1974          aio_context_release(aio_context);
1975      }
1976  
1977      return result;
1978  }
1979  
1980  
1981  typedef struct BdrvCoBlockStatusData {
1982      BlockDriverState *bs;
1983      BlockDriverState *base;
1984      bool want_zero;
1985      int64_t offset;
1986      int64_t bytes;
1987      int64_t *pnum;
1988      int64_t *map;
1989      BlockDriverState **file;
1990      int ret;
1991      bool done;
1992  } BdrvCoBlockStatusData;
1993  
1994  int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
1995                                                  bool want_zero,
1996                                                  int64_t offset,
1997                                                  int64_t bytes,
1998                                                  int64_t *pnum,
1999                                                  int64_t *map,
2000                                                  BlockDriverState **file)
2001  {
2002      assert(bs->file && bs->file->bs);
2003      *pnum = bytes;
2004      *map = offset;
2005      *file = bs->file->bs;
2006      return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2007  }
2008  
2009  int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
2010                                                     bool want_zero,
2011                                                     int64_t offset,
2012                                                     int64_t bytes,
2013                                                     int64_t *pnum,
2014                                                     int64_t *map,
2015                                                     BlockDriverState **file)
2016  {
2017      assert(bs->backing && bs->backing->bs);
2018      *pnum = bytes;
2019      *map = offset;
2020      *file = bs->backing->bs;
2021      return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2022  }
2023  
2024  /*
2025   * Returns the allocation status of the specified byte range.
2026   * Drivers not implementing the functionality are assumed to not support
2027   * backing files, hence the whole range is reported as allocated.
2028   *
2029   * If 'want_zero' is true, the caller is querying for mapping
2030   * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
2031   * _ZERO where possible; otherwise, the result favors larger 'pnum',
2032   * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2033   *
2034   * If 'offset' is beyond the end of the disk image the return value is
2035   * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2036   *
2037   * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2038   * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2039   * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
2040   *
2041   * 'pnum' is set to the number of bytes (including and immediately
2042   * following the specified offset) that are easily known to be in the
2043   * same allocated/unallocated state.  Note that a second call starting
2044   * at the original offset plus returned pnum may have the same status.
2045   * The returned value is non-zero on success except at end-of-file.
2046   *
2047   * Returns negative errno on failure.  Otherwise, if the
2048   * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
2049   * set to the host mapping and BDS corresponding to the guest offset.
2050   */
2051  static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
2052                                               bool want_zero,
2053                                               int64_t offset, int64_t bytes,
2054                                               int64_t *pnum, int64_t *map,
2055                                               BlockDriverState **file)
2056  {
2057      int64_t total_size;
2058      int64_t n; /* bytes */
2059      int ret;
2060      int64_t local_map = 0;
2061      BlockDriverState *local_file = NULL;
2062      int64_t aligned_offset, aligned_bytes;
2063      uint32_t align;
2064  
2065      assert(pnum);
2066      *pnum = 0;
2067      total_size = bdrv_getlength(bs);
2068      if (total_size < 0) {
2069          ret = total_size;
2070          goto early_out;
2071      }
2072  
2073      if (offset >= total_size) {
2074          ret = BDRV_BLOCK_EOF;
2075          goto early_out;
2076      }
2077      if (!bytes) {
2078          ret = 0;
2079          goto early_out;
2080      }
2081  
2082      n = total_size - offset;
2083      if (n < bytes) {
2084          bytes = n;
2085      }
2086  
2087      /* Must be non-NULL or bdrv_getlength() would have failed */
2088      assert(bs->drv);
2089      if (!bs->drv->bdrv_co_block_status) {
2090          *pnum = bytes;
2091          ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2092          if (offset + bytes == total_size) {
2093              ret |= BDRV_BLOCK_EOF;
2094          }
2095          if (bs->drv->protocol_name) {
2096              ret |= BDRV_BLOCK_OFFSET_VALID;
2097              local_map = offset;
2098              local_file = bs;
2099          }
2100          goto early_out;
2101      }
2102  
2103      bdrv_inc_in_flight(bs);
2104  
2105      /* Round out to request_alignment boundaries */
2106      align = bs->bl.request_alignment;
2107      aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2108      aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2109  
2110      ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
2111                                          aligned_bytes, pnum, &local_map,
2112                                          &local_file);
2113      if (ret < 0) {
2114          *pnum = 0;
2115          goto out;
2116      }
2117  
2118      /*
2119       * The driver's result must be a non-zero multiple of request_alignment.
2120       * Clamp pnum and adjust map to original request.
2121       */
2122      assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2123             align > offset - aligned_offset);
2124      *pnum -= offset - aligned_offset;
2125      if (*pnum > bytes) {
2126          *pnum = bytes;
2127      }
2128      if (ret & BDRV_BLOCK_OFFSET_VALID) {
2129          local_map += offset - aligned_offset;
2130      }
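    /*
     * Worked example (illustrative numbers): with request_alignment = 512,
     * offset = 700 and bytes = 100, the driver is queried with
     * aligned_offset = 512 and aligned_bytes = 512.  If it reports
     * *pnum = 512, the adjustment above first subtracts the 188 bytes of
     * rounding (700 - 512), leaving 324, then clamps to the caller's 100
     * bytes; when BDRV_BLOCK_OFFSET_VALID is set, local_map is advanced by
     * the same 188 bytes.
     */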
2131  
2132      if (ret & BDRV_BLOCK_RAW) {
2133          assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2134          ret = bdrv_co_block_status(local_file, want_zero, local_map,
2135                                     *pnum, pnum, &local_map, &local_file);
2136          goto out;
2137      }
2138  
2139      if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2140          ret |= BDRV_BLOCK_ALLOCATED;
2141      } else if (want_zero) {
2142          if (bdrv_unallocated_blocks_are_zero(bs)) {
2143              ret |= BDRV_BLOCK_ZERO;
2144          } else if (bs->backing) {
2145              BlockDriverState *bs2 = bs->backing->bs;
2146              int64_t size2 = bdrv_getlength(bs2);
2147  
2148              if (size2 >= 0 && offset >= size2) {
2149                  ret |= BDRV_BLOCK_ZERO;
2150              }
2151          }
2152      }
2153  
2154      if (want_zero && local_file && local_file != bs &&
2155          (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2156          (ret & BDRV_BLOCK_OFFSET_VALID)) {
2157          int64_t file_pnum;
2158          int ret2;
2159  
2160          ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
2161                                      *pnum, &file_pnum, NULL, NULL);
2162          if (ret2 >= 0) {
2163              /* Ignore errors.  This is just providing extra information; it
2164               * is useful but not necessary.
2165               */
2166              if (ret2 & BDRV_BLOCK_EOF &&
2167                  (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2168                  /*
2169                   * It is valid for the format block driver to read
2170                   * beyond the end of the underlying file's current
2171                   * size; such areas read as zero.
2172                   */
2173                  ret |= BDRV_BLOCK_ZERO;
2174              } else {
2175                  /* Limit request to the range reported by the protocol driver */
2176                  *pnum = file_pnum;
2177                  ret |= (ret2 & BDRV_BLOCK_ZERO);
2178              }
2179          }
2180      }
2181  
2182  out:
2183      bdrv_dec_in_flight(bs);
2184      if (ret >= 0 && offset + *pnum == total_size) {
2185          ret |= BDRV_BLOCK_EOF;
2186      }
2187  early_out:
2188      if (file) {
2189          *file = local_file;
2190      }
2191      if (map) {
2192          *map = local_map;
2193      }
2194      return ret;
2195  }
2196  
2197  static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2198                                                     BlockDriverState *base,
2199                                                     bool want_zero,
2200                                                     int64_t offset,
2201                                                     int64_t bytes,
2202                                                     int64_t *pnum,
2203                                                     int64_t *map,
2204                                                     BlockDriverState **file)
2205  {
2206      BlockDriverState *p;
2207      int ret = 0;
2208      bool first = true;
2209  
2210      assert(bs != base);
2211      for (p = bs; p != base; p = backing_bs(p)) {
2212          ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
2213                                     file);
2214          if (ret < 0) {
2215              break;
2216          }
2217          if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2218              /*
2219               * Reading beyond the end of the file continues to read
2220               * zeroes, but we can only widen the result to the
2221               * unallocated length we learned from an earlier
2222               * iteration.
2223               */
2224              *pnum = bytes;
2225          }
2226          if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2227              break;
2228          }
2229          /* [offset, pnum] unallocated on this layer, which could be only
2230           * the first part of [offset, bytes].  */
2231          bytes = MIN(bytes, *pnum);
2232          first = false;
2233      }
2234      return ret;
2235  }
2236  
2237  /* Coroutine wrapper for bdrv_block_status_above() */
2238  static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
2239  {
2240      BdrvCoBlockStatusData *data = opaque;
2241  
2242      data->ret = bdrv_co_block_status_above(data->bs, data->base,
2243                                             data->want_zero,
2244                                             data->offset, data->bytes,
2245                                             data->pnum, data->map, data->file);
2246      data->done = true;
2247      aio_wait_kick();
2248  }
2249  
2250  /*
2251   * Synchronous wrapper around bdrv_co_block_status_above().
2252   *
2253   * See bdrv_co_block_status_above() for details.
2254   */
2255  static int bdrv_common_block_status_above(BlockDriverState *bs,
2256                                            BlockDriverState *base,
2257                                            bool want_zero, int64_t offset,
2258                                            int64_t bytes, int64_t *pnum,
2259                                            int64_t *map,
2260                                            BlockDriverState **file)
2261  {
2262      Coroutine *co;
2263      BdrvCoBlockStatusData data = {
2264          .bs = bs,
2265          .base = base,
2266          .want_zero = want_zero,
2267          .offset = offset,
2268          .bytes = bytes,
2269          .pnum = pnum,
2270          .map = map,
2271          .file = file,
2272          .done = false,
2273      };
2274  
2275      if (qemu_in_coroutine()) {
2276          /* Fast-path if already in coroutine context */
2277          bdrv_block_status_above_co_entry(&data);
2278      } else {
2279          co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
2280          bdrv_coroutine_enter(bs, co);
2281          BDRV_POLL_WHILE(bs, !data.done);
2282      }
2283      return data.ret;
2284  }
2285  
2286  int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
2287                              int64_t offset, int64_t bytes, int64_t *pnum,
2288                              int64_t *map, BlockDriverState **file)
2289  {
2290      return bdrv_common_block_status_above(bs, base, true, offset, bytes,
2291                                            pnum, map, file);
2292  }
2293  
2294  int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2295                        int64_t *pnum, int64_t *map, BlockDriverState **file)
2296  {
2297      return bdrv_block_status_above(bs, backing_bs(bs),
2298                                     offset, bytes, pnum, map, file);
2299  }
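
/*
 * Illustrative sketch (hypothetical caller, not taken from in-tree code):
 * classifying an image extent by extent.
 *
 *     int64_t offset = 0, pnum;
 *     int64_t total = bdrv_getlength(bs);
 *
 *     while (total > 0 && offset < total) {
 *         int ret = bdrv_block_status(bs, offset, total - offset,
 *                                     &pnum, NULL, NULL);
 *         if (ret < 0 || !pnum) {
 *             break;
 *         }
 *         if (ret & BDRV_BLOCK_ZERO) {
 *             ... range reads as zeroes ...
 *         } else if (ret & BDRV_BLOCK_DATA) {
 *             ... range contains data ...
 *         }
 *         offset += pnum;
 *     }
 */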
2300  
2301  int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2302                                     int64_t bytes, int64_t *pnum)
2303  {
2304      int ret;
2305      int64_t dummy;
2306  
2307      ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
2308                                           bytes, pnum ? pnum : &dummy, NULL,
2309                                           NULL);
2310      if (ret < 0) {
2311          return ret;
2312      }
2313      return !!(ret & BDRV_BLOCK_ALLOCATED);
2314  }
2315  
2316  /*
2317   * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2318   *
2319   * Return true if (a prefix of) the given range is allocated in any image
2320   * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
2321   * offset is allocated in any image of the chain.  Return false otherwise,
2322   * or negative errno on failure.
2323   *
2324   * 'pnum' is set to the number of bytes (including and immediately
2325   * following the specified offset) that are known to be in the same
2326   * allocated/unallocated state.  Note that a subsequent call starting
2327   * at 'offset + *pnum' may return the same allocation status (in other
2328   * words, the result is not necessarily the maximum possible range);
2329   * but 'pnum' will only be 0 when end of file is reached.
2330   *
2331   */
2332  int bdrv_is_allocated_above(BlockDriverState *top,
2333                              BlockDriverState *base,
2334                              int64_t offset, int64_t bytes, int64_t *pnum)
2335  {
2336      BlockDriverState *intermediate;
2337      int ret;
2338      int64_t n = bytes;
2339  
2340      intermediate = top;
2341      while (intermediate && intermediate != base) {
2342          int64_t pnum_inter;
2343          int64_t size_inter;
2344  
2345          ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
2346          if (ret < 0) {
2347              return ret;
2348          }
2349          if (ret) {
2350              *pnum = pnum_inter;
2351              return 1;
2352          }
2353  
2354          size_inter = bdrv_getlength(intermediate);
2355          if (size_inter < 0) {
2356              return size_inter;
2357          }
2358          if (n > pnum_inter &&
2359              (intermediate == top || offset + pnum_inter < size_inter)) {
2360              n = pnum_inter;
2361          }
2362  
2363          intermediate = backing_bs(intermediate);
2364      }
2365  
2366      *pnum = n;
2367      return 0;
2368  }
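
/*
 * Illustrative sketch (hypothetical caller): deciding whether a range of TOP
 * still depends on nodes above BASE, e.g. before removing intermediate
 * images from the chain.
 *
 *     int64_t pnum;
 *     int ret = bdrv_is_allocated_above(top, base, offset, bytes, &pnum);
 *
 *     if (ret < 0) {
 *         ... error ...
 *     } else if (ret) {
 *         ... the first pnum bytes are allocated in top or an intermediate
 *             image above base ...
 *     } else {
 *         ... the first pnum bytes would be read from base or deeper
 *             (or as zeroes) ...
 *     }
 */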
2369  
2370  typedef struct BdrvVmstateCo {
2371      BlockDriverState    *bs;
2372      QEMUIOVector        *qiov;
2373      int64_t             pos;
2374      bool                is_read;
2375      int                 ret;
2376  } BdrvVmstateCo;
2377  
2378  static int coroutine_fn
2379  bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2380                     bool is_read)
2381  {
2382      BlockDriver *drv = bs->drv;
2383      int ret = -ENOTSUP;
2384  
2385      bdrv_inc_in_flight(bs);
2386  
2387      if (!drv) {
2388          ret = -ENOMEDIUM;
2389      } else if (drv->bdrv_load_vmstate) {
2390          if (is_read) {
2391              ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2392          } else {
2393              ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2394          }
2395      } else if (bs->file) {
2396          ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2397      }
2398  
2399      bdrv_dec_in_flight(bs);
2400      return ret;
2401  }
2402  
2403  static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
2404  {
2405      BdrvVmstateCo *co = opaque;
2406      co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
2407      aio_wait_kick();
2408  }
2409  
2410  static inline int
2411  bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2412                  bool is_read)
2413  {
2414      if (qemu_in_coroutine()) {
2415          return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
2416      } else {
2417          BdrvVmstateCo data = {
2418              .bs         = bs,
2419              .qiov       = qiov,
2420              .pos        = pos,
2421              .is_read    = is_read,
2422              .ret        = -EINPROGRESS,
2423          };
2424          Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2425  
2426          bdrv_coroutine_enter(bs, co);
2427          BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
2428          return data.ret;
2429      }
2430  }
2431  
2432  int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2433                        int64_t pos, int size)
2434  {
2435      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2436      int ret;
2437  
2438      ret = bdrv_writev_vmstate(bs, &qiov, pos);
2439      if (ret < 0) {
2440          return ret;
2441      }
2442  
2443      return size;
2444  }
2445  
2446  int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2447  {
2448      return bdrv_rw_vmstate(bs, qiov, pos, false);
2449  }
2450  
2451  int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2452                        int64_t pos, int size)
2453  {
2454      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2455      int ret;
2456  
2457      ret = bdrv_readv_vmstate(bs, &qiov, pos);
2458      if (ret < 0) {
2459          return ret;
2460      }
2461  
2462      return size;
2463  }
2464  
2465  int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2466  {
2467      return bdrv_rw_vmstate(bs, qiov, pos, true);
2468  }
2469  
2470  /**************************************************************/
2471  /* async I/Os */
2472  
2473  void bdrv_aio_cancel(BlockAIOCB *acb)
2474  {
2475      qemu_aio_ref(acb);
2476      bdrv_aio_cancel_async(acb);
2477      while (acb->refcnt > 1) {
2478          if (acb->aiocb_info->get_aio_context) {
2479              aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2480          } else if (acb->bs) {
2481              /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2482               * assert that we're not using an I/O thread.  Thread-safe
2483               * code should use bdrv_aio_cancel_async exclusively.
2484               */
2485              assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2486              aio_poll(bdrv_get_aio_context(acb->bs), true);
2487          } else {
2488              abort();
2489          }
2490      }
2491      qemu_aio_unref(acb);
2492  }
2493  
2494  /* Async version of aio cancel. The caller is not blocked if the acb implements
2495   * cancel_async; otherwise we do nothing and let the request complete normally.
2496   * In either case the completion callback must be called. */
2497  void bdrv_aio_cancel_async(BlockAIOCB *acb)
2498  {
2499      if (acb->aiocb_info->cancel_async) {
2500          acb->aiocb_info->cancel_async(acb);
2501      }
2502  }
2503  
2504  /**************************************************************/
2505  /* Coroutine block device emulation */
2506  
2507  typedef struct FlushCo {
2508      BlockDriverState *bs;
2509      int ret;
2510  } FlushCo;
2511  
2512  
2513  static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2514  {
2515      FlushCo *rwco = opaque;
2516  
2517      rwco->ret = bdrv_co_flush(rwco->bs);
2518      aio_wait_kick();
2519  }
2520  
2521  int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2522  {
2523      int current_gen;
2524      int ret = 0;
2525  
2526      bdrv_inc_in_flight(bs);
2527  
2528      if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2529          bdrv_is_sg(bs)) {
2530          goto early_exit;
2531      }
2532  
2533      qemu_co_mutex_lock(&bs->reqs_lock);
2534      current_gen = atomic_read(&bs->write_gen);
2535  
2536      /* Wait until any previous flushes are completed */
2537      while (bs->active_flush_req) {
2538          qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2539      }
2540  
2541      /* Flushes reach this point in nondecreasing current_gen order.  */
2542      bs->active_flush_req = true;
2543      qemu_co_mutex_unlock(&bs->reqs_lock);
2544  
2545      /* Write back all layers by calling one driver function */
2546      if (bs->drv->bdrv_co_flush) {
2547          ret = bs->drv->bdrv_co_flush(bs);
2548          goto out;
2549      }
2550  
2551      /* Write back cached data to the OS even with cache=unsafe */
2552      BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2553      if (bs->drv->bdrv_co_flush_to_os) {
2554          ret = bs->drv->bdrv_co_flush_to_os(bs);
2555          if (ret < 0) {
2556              goto out;
2557          }
2558      }
2559  
2560      /* But don't actually force it to the disk with cache=unsafe */
2561      if (bs->open_flags & BDRV_O_NO_FLUSH) {
2562          goto flush_parent;
2563      }
2564  
2565      /* Check if we really need to flush anything */
2566      if (bs->flushed_gen == current_gen) {
2567          goto flush_parent;
2568      }
2569  
2570      BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2571      if (!bs->drv) {
2572          /* bs->drv->bdrv_co_flush() might have ejected the BDS
2573           * (even in case of apparent success) */
2574          ret = -ENOMEDIUM;
2575          goto out;
2576      }
2577      if (bs->drv->bdrv_co_flush_to_disk) {
2578          ret = bs->drv->bdrv_co_flush_to_disk(bs);
2579      } else if (bs->drv->bdrv_aio_flush) {
2580          BlockAIOCB *acb;
2581          CoroutineIOCompletion co = {
2582              .coroutine = qemu_coroutine_self(),
2583          };
2584  
2585          acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2586          if (acb == NULL) {
2587              ret = -EIO;
2588          } else {
2589              qemu_coroutine_yield();
2590              ret = co.ret;
2591          }
2592      } else {
2593          /*
2594           * Some block drivers always operate in either writethrough or unsafe
2595           * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2596           * know how the server works (because the behaviour is hardcoded or
2597           * depends on server-side configuration), so we can't ensure that
2598           * everything is safe on disk. Returning an error doesn't work because
2599           * that would break guests even if the server operates in writethrough
2600           * mode.
2601           *
2602           * Let's hope the user knows what he's doing.
2603           */
2604          ret = 0;
2605      }
2606  
2607      if (ret < 0) {
2608          goto out;
2609      }
2610  
2611      /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2612       * in the case of cache=unsafe, so there are no useless flushes.
2613       */
2614  flush_parent:
2615      ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2616  out:
2617      /* Notify any pending flushes that we have completed */
2618      if (ret == 0) {
2619          bs->flushed_gen = current_gen;
2620      }
2621  
2622      qemu_co_mutex_lock(&bs->reqs_lock);
2623      bs->active_flush_req = false;
2624      /* Return value is ignored - it's ok if wait queue is empty */
2625      qemu_co_queue_next(&bs->flush_queue);
2626      qemu_co_mutex_unlock(&bs->reqs_lock);
2627  
2628  early_exit:
2629      bdrv_dec_in_flight(bs);
2630      return ret;
2631  }
2632  
2633  int bdrv_flush(BlockDriverState *bs)
2634  {
2635      Coroutine *co;
2636      FlushCo flush_co = {
2637          .bs = bs,
2638          .ret = NOT_DONE,
2639      };
2640  
2641      if (qemu_in_coroutine()) {
2642          /* Fast-path if already in coroutine context */
2643          bdrv_flush_co_entry(&flush_co);
2644      } else {
2645          co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2646          bdrv_coroutine_enter(bs, co);
2647          BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2648      }
2649  
2650      return flush_co.ret;
2651  }
2652  
2653  typedef struct DiscardCo {
2654      BdrvChild *child;
2655      int64_t offset;
2656      int bytes;
2657      int ret;
2658  } DiscardCo;
2659  static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2660  {
2661      DiscardCo *rwco = opaque;
2662  
2663      rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
2664      aio_wait_kick();
2665  }
2666  
2667  int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int bytes)
2668  {
2669      BdrvTrackedRequest req;
2670      int max_pdiscard, ret;
2671      int head, tail, align;
2672      BlockDriverState *bs = child->bs;
2673  
2674      if (!bs || !bs->drv) {
2675          return -ENOMEDIUM;
2676      }
2677  
2678      if (bdrv_has_readonly_bitmaps(bs)) {
2679          return -EPERM;
2680      }
2681  
2682      ret = bdrv_check_byte_request(bs, offset, bytes);
2683      if (ret < 0) {
2684          return ret;
2685      }
2686  
2687      /* Do nothing if disabled.  */
2688      if (!(bs->open_flags & BDRV_O_UNMAP)) {
2689          return 0;
2690      }
2691  
2692      if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2693          return 0;
2694      }
2695  
2696      /* Discard is advisory, but some devices track and coalesce
2697       * unaligned requests, so we must pass everything down rather than
2698       * round here.  Still, most devices will just silently ignore
2699       * unaligned requests (by returning -ENOTSUP), so we must fragment
2700       * the request accordingly.  */
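    /*
     * Worked example (purely illustrative numbers): with
     * request_alignment = 512, pdiscard_alignment = 64K, and assuming
     * max_pdiscard does not further limit the requests, a discard of 200K at
     * offset 100K has head = 36K and tail = 44K, so the loop below issues a
     * 28K request up to the 128K boundary, one aligned 128K request, and a
     * final 44K tail request.
     */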
2701      align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2702      assert(align % bs->bl.request_alignment == 0);
2703      head = offset % align;
2704      tail = (offset + bytes) % align;
2705  
2706      bdrv_inc_in_flight(bs);
2707      tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
2708  
2709      ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
2710      if (ret < 0) {
2711          goto out;
2712      }
2713  
2714      max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2715                                     align);
2716      assert(max_pdiscard >= bs->bl.request_alignment);
2717  
2718      while (bytes > 0) {
2719          int num = bytes;
2720  
2721          if (head) {
2722              /* Make small requests to get to alignment boundaries. */
2723              num = MIN(bytes, align - head);
2724              if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2725                  num %= bs->bl.request_alignment;
2726              }
2727              head = (head + num) % align;
2728              assert(num < max_pdiscard);
2729          } else if (tail) {
2730              if (num > align) {
2731                  /* Shorten the request to the last aligned cluster.  */
2732                  num -= tail;
2733              } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2734                         tail > bs->bl.request_alignment) {
2735                  tail %= bs->bl.request_alignment;
2736                  num -= tail;
2737              }
2738          }
2739          /* limit request size */
2740          if (num > max_pdiscard) {
2741              num = max_pdiscard;
2742          }
2743  
2744          if (!bs->drv) {
2745              ret = -ENOMEDIUM;
2746              goto out;
2747          }
2748          if (bs->drv->bdrv_co_pdiscard) {
2749              ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2750          } else {
2751              BlockAIOCB *acb;
2752              CoroutineIOCompletion co = {
2753                  .coroutine = qemu_coroutine_self(),
2754              };
2755  
2756              acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2757                                               bdrv_co_io_em_complete, &co);
2758              if (acb == NULL) {
2759                  ret = -EIO;
2760                  goto out;
2761              } else {
2762                  qemu_coroutine_yield();
2763                  ret = co.ret;
2764              }
2765          }
2766          if (ret && ret != -ENOTSUP) {
2767              goto out;
2768          }
2769  
2770          offset += num;
2771          bytes -= num;
2772      }
2773      ret = 0;
2774  out:
2775      bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
2776      tracked_request_end(&req);
2777      bdrv_dec_in_flight(bs);
2778      return ret;
2779  }
2780  
2781  int bdrv_pdiscard(BdrvChild *child, int64_t offset, int bytes)
2782  {
2783      Coroutine *co;
2784      DiscardCo rwco = {
2785          .child = child,
2786          .offset = offset,
2787          .bytes = bytes,
2788          .ret = NOT_DONE,
2789      };
2790  
2791      if (qemu_in_coroutine()) {
2792          /* Fast-path if already in coroutine context */
2793          bdrv_pdiscard_co_entry(&rwco);
2794      } else {
2795          co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2796          bdrv_coroutine_enter(child->bs, co);
2797          BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
2798      }
2799  
2800      return rwco.ret;
2801  }
2802  
2803  int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
2804  {
2805      BlockDriver *drv = bs->drv;
2806      CoroutineIOCompletion co = {
2807          .coroutine = qemu_coroutine_self(),
2808      };
2809      BlockAIOCB *acb;
2810  
2811      bdrv_inc_in_flight(bs);
2812      if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
2813          co.ret = -ENOTSUP;
2814          goto out;
2815      }
2816  
2817      if (drv->bdrv_co_ioctl) {
2818          co.ret = drv->bdrv_co_ioctl(bs, req, buf);
2819      } else {
2820          acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2821          if (!acb) {
2822              co.ret = -ENOTSUP;
2823              goto out;
2824          }
2825          qemu_coroutine_yield();
2826      }
2827  out:
2828      bdrv_dec_in_flight(bs);
2829      return co.ret;
2830  }
2831  
2832  void *qemu_blockalign(BlockDriverState *bs, size_t size)
2833  {
2834      return qemu_memalign(bdrv_opt_mem_align(bs), size);
2835  }
2836  
2837  void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2838  {
2839      return memset(qemu_blockalign(bs, size), 0, size);
2840  }
2841  
2842  void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2843  {
2844      size_t align = bdrv_opt_mem_align(bs);
2845  
2846      /* Ensure that NULL is never returned on success */
2847      assert(align > 0);
2848      if (size == 0) {
2849          size = align;
2850      }
2851  
2852      return qemu_try_memalign(align, size);
2853  }
2854  
2855  void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2856  {
2857      void *mem = qemu_try_blockalign(bs, size);
2858  
2859      if (mem) {
2860          memset(mem, 0, size);
2861      }
2862  
2863      return mem;
2864  }
2865  
2866  /*
2867   * Check if all memory in this vector satisfies the minimum memory alignment.
2868   */
2869  bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2870  {
2871      int i;
2872      size_t alignment = bdrv_min_mem_align(bs);
2873  
2874      for (i = 0; i < qiov->niov; i++) {
2875          if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2876              return false;
2877          }
2878          if (qiov->iov[i].iov_len % alignment) {
2879              return false;
2880          }
2881      }
2882  
2883      return true;
2884  }
2885  
2886  void bdrv_add_before_write_notifier(BlockDriverState *bs,
2887                                      NotifierWithReturn *notifier)
2888  {
2889      notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2890  }
2891  
2892  void bdrv_io_plug(BlockDriverState *bs)
2893  {
2894      BdrvChild *child;
2895  
2896      QLIST_FOREACH(child, &bs->children, next) {
2897          bdrv_io_plug(child->bs);
2898      }
2899  
2900      if (atomic_fetch_inc(&bs->io_plugged) == 0) {
2901          BlockDriver *drv = bs->drv;
2902          if (drv && drv->bdrv_io_plug) {
2903              drv->bdrv_io_plug(bs);
2904          }
2905      }
2906  }
2907  
2908  void bdrv_io_unplug(BlockDriverState *bs)
2909  {
2910      BdrvChild *child;
2911  
2912      assert(bs->io_plugged);
2913      if (atomic_fetch_dec(&bs->io_plugged) == 1) {
2914          BlockDriver *drv = bs->drv;
2915          if (drv && drv->bdrv_io_unplug) {
2916              drv->bdrv_io_unplug(bs);
2917          }
2918      }
2919  
2920      QLIST_FOREACH(child, &bs->children, next) {
2921          bdrv_io_unplug(child->bs);
2922      }
2923  }
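
/*
 * Illustrative pairing (hypothetical caller): plug, queue a batch of
 * requests, then unplug so the driver can submit them in one go.  The
 * per-BDS counter makes nesting safe; only the outermost pair reaches the
 * driver's bdrv_io_plug/bdrv_io_unplug callbacks.
 *
 *     bdrv_io_plug(bs);
 *     ... queue several asynchronous requests on bs ...
 *     bdrv_io_unplug(bs);
 */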
2924  
2925  void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
2926  {
2927      BdrvChild *child;
2928  
2929      if (bs->drv && bs->drv->bdrv_register_buf) {
2930          bs->drv->bdrv_register_buf(bs, host, size);
2931      }
2932      QLIST_FOREACH(child, &bs->children, next) {
2933          bdrv_register_buf(child->bs, host, size);
2934      }
2935  }
2936  
2937  void bdrv_unregister_buf(BlockDriverState *bs, void *host)
2938  {
2939      BdrvChild *child;
2940  
2941      if (bs->drv && bs->drv->bdrv_unregister_buf) {
2942          bs->drv->bdrv_unregister_buf(bs, host);
2943      }
2944      QLIST_FOREACH(child, &bs->children, next) {
2945          bdrv_unregister_buf(child->bs, host);
2946      }
2947  }
2948  
2949  static int coroutine_fn bdrv_co_copy_range_internal(
2950          BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
2951          uint64_t dst_offset, uint64_t bytes,
2952          BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
2953          bool recurse_src)
2954  {
2955      BdrvTrackedRequest req;
2956      int ret;
2957  
2958      /* TODO We can support BDRV_REQ_NO_FALLBACK here */
2959      assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
2960      assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
2961  
2962      if (!dst || !dst->bs) {
2963          return -ENOMEDIUM;
2964      }
2965      ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
2966      if (ret) {
2967          return ret;
2968      }
2969      if (write_flags & BDRV_REQ_ZERO_WRITE) {
2970          return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
2971      }
2972  
2973      if (!src || !src->bs) {
2974          return -ENOMEDIUM;
2975      }
2976      ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
2977      if (ret) {
2978          return ret;
2979      }
2980  
2981      if (!src->bs->drv->bdrv_co_copy_range_from
2982          || !dst->bs->drv->bdrv_co_copy_range_to
2983          || src->bs->encrypted || dst->bs->encrypted) {
2984          return -ENOTSUP;
2985      }
2986  
2987      if (recurse_src) {
2988          bdrv_inc_in_flight(src->bs);
2989          tracked_request_begin(&req, src->bs, src_offset, bytes,
2990                                BDRV_TRACKED_READ);
2991  
2992          /* BDRV_REQ_SERIALISING is only for write operations */
2993          assert(!(read_flags & BDRV_REQ_SERIALISING));
2994          if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
2995              wait_serialising_requests(&req);
2996          }
2997  
2998          ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
2999                                                      src, src_offset,
3000                                                      dst, dst_offset,
3001                                                      bytes,
3002                                                      read_flags, write_flags);
3003  
3004          tracked_request_end(&req);
3005          bdrv_dec_in_flight(src->bs);
3006      } else {
3007          bdrv_inc_in_flight(dst->bs);
3008          tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3009                                BDRV_TRACKED_WRITE);
3010          ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
3011                                          write_flags);
3012          if (!ret) {
3013              ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3014                                                        src, src_offset,
3015                                                        dst, dst_offset,
3016                                                        bytes,
3017                                                        read_flags, write_flags);
3018          }
3019          bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3020          tracked_request_end(&req);
3021          bdrv_dec_in_flight(dst->bs);
3022      }
3023  
3024      return ret;
3025  }
3026  
3027  /* Copy range from @src to @dst.
3028   *
3029   * See the comment of bdrv_co_copy_range for the parameter and return value
3030   * semantics. */
3031  int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3032                                           BdrvChild *dst, uint64_t dst_offset,
3033                                           uint64_t bytes,
3034                                           BdrvRequestFlags read_flags,
3035                                           BdrvRequestFlags write_flags)
3036  {
3037      trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3038                                    read_flags, write_flags);
3039      return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3040                                         bytes, read_flags, write_flags, true);
3041  }
3042  
3043  /* Copy range from @src to @dst.
3044   *
3045   * See the comment of bdrv_co_copy_range for the parameter and return value
3046   * semantics. */
3047  int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3048                                         BdrvChild *dst, uint64_t dst_offset,
3049                                         uint64_t bytes,
3050                                         BdrvRequestFlags read_flags,
3051                                         BdrvRequestFlags write_flags)
3052  {
3053      trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3054                                  read_flags, write_flags);
3055      return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3056                                         bytes, read_flags, write_flags, false);
3057  }
3058  
3059  int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3060                                      BdrvChild *dst, uint64_t dst_offset,
3061                                      uint64_t bytes, BdrvRequestFlags read_flags,
3062                                      BdrvRequestFlags write_flags)
3063  {
3064      return bdrv_co_copy_range_from(src, src_offset,
3065                                     dst, dst_offset,
3066                                     bytes, read_flags, write_flags);
3067  }
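
/*
 * Illustrative use (hypothetical caller, coroutine context): try to offload
 * a copy between two children and fall back in the caller if the drivers
 * lack the copy_range hooks:
 *
 *     ret = bdrv_co_copy_range(src_child, src_offset, dst_child, dst_offset,
 *                              bytes, 0, 0);
 *     if (ret == -ENOTSUP) {
 *         ... fall back to a bounce-buffered read + write ...
 *     }
 */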
3068  
3069  static void bdrv_parent_cb_resize(BlockDriverState *bs)
3070  {
3071      BdrvChild *c;
3072      QLIST_FOREACH(c, &bs->parents, next_parent) {
3073          if (c->role->resize) {
3074              c->role->resize(c);
3075          }
3076      }
3077  }
3078  
3079  /**
3080   * Truncate file to 'offset' bytes (needed only for file protocols)
3081   */
3082  int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
3083                                    PreallocMode prealloc, Error **errp)
3084  {
3085      BlockDriverState *bs = child->bs;
3086      BlockDriver *drv = bs->drv;
3087      BdrvTrackedRequest req;
3088      int64_t old_size, new_bytes;
3089      int ret;
3090  
3091  
3092      /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
3093      if (!drv) {
3094          error_setg(errp, "No medium inserted");
3095          return -ENOMEDIUM;
3096      }
3097      if (offset < 0) {
3098          error_setg(errp, "Image size cannot be negative");
3099          return -EINVAL;
3100      }
3101  
3102      old_size = bdrv_getlength(bs);
3103      if (old_size < 0) {
3104          error_setg_errno(errp, -old_size, "Failed to get old image size");
3105          return old_size;
3106      }
3107  
3108      if (offset > old_size) {
3109          new_bytes = offset - old_size;
3110      } else {
3111          new_bytes = 0;
3112      }
3113  
3114      bdrv_inc_in_flight(bs);
3115      tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
3116                            BDRV_TRACKED_TRUNCATE);
3117  
3118      /* If we are growing the image and potentially using preallocation for the
3119       * new area, we need to make sure that no write requests are made to it
3120       * concurrently or they might be overwritten by preallocation. */
3121      if (new_bytes) {
3122          mark_request_serialising(&req, 1);
3123      }
3124      if (bs->read_only) {
3125          error_setg(errp, "Image is read-only");
3126          ret = -EACCES;
3127          goto out;
3128      }
3129      ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3130                                      0);
3131      if (ret < 0) {
3132          error_setg_errno(errp, -ret,
3133                           "Failed to prepare request for truncation");
3134          goto out;
3135      }
3136  
3137      if (!drv->bdrv_co_truncate) {
3138          if (bs->file && drv->is_filter) {
3139              ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
3140              goto out;
3141          }
3142          error_setg(errp, "Image format driver does not support resize");
3143          ret = -ENOTSUP;
3144          goto out;
3145      }
3146  
3147      ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
3148      if (ret < 0) {
3149          goto out;
3150      }
3151      ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3152      if (ret < 0) {
3153          error_setg_errno(errp, -ret, "Could not refresh total sector count");
3154      } else {
3155          offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3156      }
3157      /* It's possible that truncation succeeded but refresh_total_sectors
3158       * failed, but the latter doesn't affect how we should finish the request.
3159       * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
3160      bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
3161  
3162  out:
3163      tracked_request_end(&req);
3164      bdrv_dec_in_flight(bs);
3165  
3166      return ret;
3167  }
3168  
3169  typedef struct TruncateCo {
3170      BdrvChild *child;
3171      int64_t offset;
3172      PreallocMode prealloc;
3173      Error **errp;
3174      int ret;
3175  } TruncateCo;
3176  
3177  static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
3178  {
3179      TruncateCo *tco = opaque;
3180      tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
3181                                  tco->errp);
3182      aio_wait_kick();
3183  }
3184  
3185  int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
3186                    Error **errp)
3187  {
3188      Coroutine *co;
3189      TruncateCo tco = {
3190          .child      = child,
3191          .offset     = offset,
3192          .prealloc   = prealloc,
3193          .errp       = errp,
3194          .ret        = NOT_DONE,
3195      };
3196  
3197      if (qemu_in_coroutine()) {
3198          /* Fast-path if already in coroutine context */
3199          bdrv_truncate_co_entry(&tco);
3200      } else {
3201          co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
3202          bdrv_coroutine_enter(child->bs, co);
3203          BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
3204      }
3205  
3206      return tco.ret;
3207  }
3208