xref: /openbmc/qemu/block/qed.c (revision 623d7e3551a6fc5693c06ea938c60fe281b52e27)
1  /*
2   * QEMU Enhanced Disk Format
3   *
4   * Copyright IBM, Corp. 2010
5   *
6   * Authors:
7   *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
8   *  Anthony Liguori   <aliguori@us.ibm.com>
9   *
10   * This work is licensed under the terms of the GNU LGPL, version 2 or later.
11   * See the COPYING.LIB file in the top-level directory.
12   *
13   */
14  
15  #include "qemu/osdep.h"
16  #include "block/qdict.h"
17  #include "qapi/error.h"
18  #include "qemu/timer.h"
19  #include "qemu/bswap.h"
20  #include "qemu/main-loop.h"
21  #include "qemu/module.h"
22  #include "qemu/option.h"
23  #include "qemu/memalign.h"
24  #include "trace.h"
25  #include "qed.h"
26  #include "sysemu/block-backend.h"
27  #include "qapi/qmp/qdict.h"
28  #include "qapi/qobject-input-visitor.h"
29  #include "qapi/qapi-visit-block-core.h"
30  
31  static QemuOptsList qed_create_opts;
32  
33  static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
34                            const char *filename)
35  {
36      const QEDHeader *header = (const QEDHeader *)buf;
37  
38      if (buf_size < sizeof(*header)) {
39          return 0;
40      }
41      if (le32_to_cpu(header->magic) != QED_MAGIC) {
42          return 0;
43      }
44      return 100;
45  }
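/*
 * Probe scoring note: the block layer hands the first bytes of the file to
 * each driver's .bdrv_probe and picks the driver reporting the highest
 * score.  Returning 100 here means "definitely QED" (the magic matched),
 * while 0 means "not QED at all"; there is no partial confidence.
 */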
46  
47  /**
48   * Check whether an image format is raw
49   *
50   * @fmt:    Backing file format, may be NULL
51   */
52  static bool qed_fmt_is_raw(const char *fmt)
53  {
54      return fmt && strcmp(fmt, "raw") == 0;
55  }
56  
57  static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
58  {
59      cpu->magic = le32_to_cpu(le->magic);
60      cpu->cluster_size = le32_to_cpu(le->cluster_size);
61      cpu->table_size = le32_to_cpu(le->table_size);
62      cpu->header_size = le32_to_cpu(le->header_size);
63      cpu->features = le64_to_cpu(le->features);
64      cpu->compat_features = le64_to_cpu(le->compat_features);
65      cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
66      cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
67      cpu->image_size = le64_to_cpu(le->image_size);
68      cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
69      cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
70  }
71  
72  static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
73  {
74      le->magic = cpu_to_le32(cpu->magic);
75      le->cluster_size = cpu_to_le32(cpu->cluster_size);
76      le->table_size = cpu_to_le32(cpu->table_size);
77      le->header_size = cpu_to_le32(cpu->header_size);
78      le->features = cpu_to_le64(cpu->features);
79      le->compat_features = cpu_to_le64(cpu->compat_features);
80      le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
81      le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
82      le->image_size = cpu_to_le64(cpu->image_size);
83      le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
84      le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
85  }
86  
87  int qed_write_header_sync(BDRVQEDState *s)
88  {
89      QEDHeader le;
90  
91      qed_header_cpu_to_le(&s->header, &le);
92      return bdrv_pwrite(s->bs->file, 0, sizeof(le), &le, 0);
93  }
94  
95  /**
96   * Update header in-place (does not rewrite backing filename or other strings)
97   *
98   * This function only updates known header fields in-place and does not affect
99   * extra data after the QED header.
100   *
101   * No new allocating requests can start while this function runs.
102   */
103  static int coroutine_fn GRAPH_RDLOCK qed_write_header(BDRVQEDState *s)
104  {
105      /* We must write full sectors for O_DIRECT but cannot necessarily generate
106       * the data following the header if an unrecognized compat feature is
107       * active.  Therefore, first read the sectors containing the header, update
108       * them, and write back.
109       */
110  
111      int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
112      size_t len = nsectors * BDRV_SECTOR_SIZE;
113      uint8_t *buf;
114      int ret;
115  
116      assert(s->allocating_acb || s->allocating_write_reqs_plugged);
117  
118      buf = qemu_blockalign(s->bs, len);
119  
120      ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
121      if (ret < 0) {
122          goto out;
123      }
124  
125      /* Update header */
126      qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
127  
128      ret = bdrv_co_pwrite(s->bs->file, 0, len, buf, 0);
129      if (ret < 0) {
130          goto out;
131      }
132  
133      ret = 0;
134  out:
135      qemu_vfree(buf);
136      return ret;
137  }
138  
139  static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
140  {
141      uint64_t table_entries;
142      uint64_t l2_size;
143  
144      table_entries = (table_size * cluster_size) / sizeof(uint64_t);
145      l2_size = table_entries * cluster_size;
146  
147      return l2_size * table_entries;
148  }
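/*
 * Worked example (illustrative), assuming the default 64 KiB cluster size
 * and a table_size of 4 clusters: a table holds (4 * 65536) / 8 = 32768
 * entries.  Each L2 table therefore maps 32768 * 64 KiB = 2 GiB, and the
 * 32768 L2 references in the L1 table give a maximum image size of
 * 2 GiB * 32768 = 64 TiB.
 */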
149  
150  static bool qed_is_cluster_size_valid(uint32_t cluster_size)
151  {
152      if (cluster_size < QED_MIN_CLUSTER_SIZE ||
153          cluster_size > QED_MAX_CLUSTER_SIZE) {
154          return false;
155      }
156      if (cluster_size & (cluster_size - 1)) {
157          return false; /* not power of 2 */
158      }
159      return true;
160  }
161  
162  static bool qed_is_table_size_valid(uint32_t table_size)
163  {
164      if (table_size < QED_MIN_TABLE_SIZE ||
165          table_size > QED_MAX_TABLE_SIZE) {
166          return false;
167      }
168      if (table_size & (table_size - 1)) {
169          return false; /* not power of 2 */
170      }
171      return true;
172  }
173  
174  static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
175                                      uint32_t table_size)
176  {
177      if (image_size % BDRV_SECTOR_SIZE != 0) {
178          return false; /* not multiple of sector size */
179      }
180      if (image_size > qed_max_image_size(cluster_size, table_size)) {
181          return false; /* image is too large */
182      }
183      return true;
184  }
185  
186  /**
187   * Read a string of known length from the image file
188   *
189   * @file:       Image file
190   * @offset:     File offset to start of string, in bytes
191   * @n:          String length in bytes
192   * @buf:        Destination buffer
193   * @buflen:     Destination buffer length in bytes
194   * @ret:        0 on success, -errno on failure
195   *
196   * The string is NUL-terminated.
197   */
198  static int coroutine_fn GRAPH_RDLOCK
199  qed_read_string(BdrvChild *file, uint64_t offset,
200                  size_t n, char *buf, size_t buflen)
201  {
202      int ret;
203      if (n >= buflen) {
204          return -EINVAL;
205      }
206      ret = bdrv_co_pread(file, offset, n, buf, 0);
207      if (ret < 0) {
208          return ret;
209      }
210      buf[n] = '\0';
211      return 0;
212  }
213  
214  /**
215   * Allocate new clusters
216   *
217   * @s:          QED state
218   * @n:          Number of contiguous clusters to allocate
219   * @ret:        Offset of first allocated cluster
220   *
221   * This function only produces the offset where the new clusters should be
222   * written.  It updates BDRVQEDState but does not make any changes to the image
223   * file.
224   *
225   * Called with table_lock held.
226   */
227  static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
228  {
229      uint64_t offset = s->file_size;
230      s->file_size += n * s->header.cluster_size;
231      return offset;
232  }
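/*
 * Note: allocation is append-only.  New clusters are handed out from the
 * current end of file and the file grows implicitly when the data is
 * written.  This also means table entries pointing at or beyond the file
 * size can be recognized as invalid during consistency checks.
 */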
233  
234  QEDTable *qed_alloc_table(BDRVQEDState *s)
235  {
236      /* Honor O_DIRECT memory alignment requirements */
237      return qemu_blockalign(s->bs,
238                             s->header.cluster_size * s->header.table_size);
239  }
240  
241  /**
242   * Allocate a new zeroed L2 table
243   *
244   * Called with table_lock held.
245   */
246  static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
247  {
248      CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
249  
250      l2_table->table = qed_alloc_table(s);
251      l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
252  
253      memset(l2_table->table->offsets, 0,
254             s->header.cluster_size * s->header.table_size);
255      return l2_table;
256  }
257  
258  static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s)
259  {
260      qemu_co_mutex_lock(&s->table_lock);
261  
262      /* No reentrancy is allowed.  */
263      assert(!s->allocating_write_reqs_plugged);
264      if (s->allocating_acb != NULL) {
265          /* Another allocating write came concurrently.  This cannot happen
266           * from bdrv_qed_drain_begin, but it can happen when the timer runs.
267           */
268          qemu_co_mutex_unlock(&s->table_lock);
269          return false;
270      }
271  
272      s->allocating_write_reqs_plugged = true;
273      qemu_co_mutex_unlock(&s->table_lock);
274      return true;
275  }
276  
277  static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s)
278  {
279      qemu_co_mutex_lock(&s->table_lock);
280      assert(s->allocating_write_reqs_plugged);
281      s->allocating_write_reqs_plugged = false;
282      qemu_co_queue_next(&s->allocating_write_reqs);
283      qemu_co_mutex_unlock(&s->table_lock);
284  }
285  
286  static void coroutine_fn GRAPH_RDLOCK qed_need_check_timer(BDRVQEDState *s)
287  {
288      int ret;
289  
290      trace_qed_need_check_timer_cb(s);
291      assert_bdrv_graph_readable();
292  
293      if (!qed_plug_allocating_write_reqs(s)) {
294          return;
295      }
296  
297      /* Ensure writes are on disk before clearing flag */
298      ret = bdrv_co_flush(s->bs->file->bs);
299      if (ret < 0) {
300          qed_unplug_allocating_write_reqs(s);
301          return;
302      }
303  
304      s->header.features &= ~QED_F_NEED_CHECK;
305      ret = qed_write_header(s);
306      (void) ret;
307  
308      qed_unplug_allocating_write_reqs(s);
309  
310      ret = bdrv_co_flush(s->bs);
311      (void) ret;
312  }
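/*
 * Timer rationale (summary): QED_F_NEED_CHECK is set before an allocating
 * write proceeds so that a crash leaves the image marked dirty.  Once the
 * image has been idle for QED_NEED_CHECK_TIMEOUT seconds, this callback
 * flushes outstanding data, clears the flag and rewrites the header, so a
 * later open of a cleanly shut-down image does not need a consistency check.
 */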
313  
314  static void coroutine_fn qed_need_check_timer_entry(void *opaque)
315  {
316      BDRVQEDState *s = opaque;
317      GRAPH_RDLOCK_GUARD();
318  
319      qed_need_check_timer(opaque);
320      bdrv_dec_in_flight(s->bs);
321  }
322  
323  static void qed_need_check_timer_cb(void *opaque)
324  {
325      BDRVQEDState *s = opaque;
326      Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
327  
328      bdrv_inc_in_flight(s->bs);
329      qemu_coroutine_enter(co);
330  }
331  
332  static void qed_start_need_check_timer(BDRVQEDState *s)
333  {
334      trace_qed_start_need_check_timer(s);
335  
336      /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while
337       * suspended for migration.
338       */
339      timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
340                     NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
341  }
342  
343  /* It's okay to call this multiple times or when no timer is started */
344  static void qed_cancel_need_check_timer(BDRVQEDState *s)
345  {
346      trace_qed_cancel_need_check_timer(s);
347      timer_del(s->need_check_timer);
348  }
349  
350  static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
351  {
352      BDRVQEDState *s = bs->opaque;
353  
354      qed_cancel_need_check_timer(s);
355      timer_free(s->need_check_timer);
356  }
357  
358  static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
359                                          AioContext *new_context)
360  {
361      BDRVQEDState *s = bs->opaque;
362  
363      s->need_check_timer = aio_timer_new(new_context,
364                                          QEMU_CLOCK_VIRTUAL, SCALE_NS,
365                                          qed_need_check_timer_cb, s);
366      if (s->header.features & QED_F_NEED_CHECK) {
367          qed_start_need_check_timer(s);
368      }
369  }
370  
371  static void bdrv_qed_drain_begin(BlockDriverState *bs)
372  {
373      BDRVQEDState *s = bs->opaque;
374  
375      /* Fire the timer immediately in order to start doing I/O as soon as the
376       * header is flushed.
377       */
378      if (s->need_check_timer && timer_pending(s->need_check_timer)) {
379          Coroutine *co;
380  
381          qed_cancel_need_check_timer(s);
382          co = qemu_coroutine_create(qed_need_check_timer_entry, s);
383          bdrv_inc_in_flight(bs);
384          aio_co_enter(bdrv_get_aio_context(bs), co);
385      }
386  }
387  
388  static void bdrv_qed_init_state(BlockDriverState *bs)
389  {
390      BDRVQEDState *s = bs->opaque;
391  
392      memset(s, 0, sizeof(BDRVQEDState));
393      s->bs = bs;
394      qemu_co_mutex_init(&s->table_lock);
395      qemu_co_queue_init(&s->allocating_write_reqs);
396  }
397  
398  /* Called with table_lock held.  */
399  static int coroutine_fn GRAPH_RDLOCK
400  bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
401  {
402      BDRVQEDState *s = bs->opaque;
403      QEDHeader le_header;
404      int64_t file_size;
405      int ret;
406  
407      ret = bdrv_co_pread(bs->file, 0, sizeof(le_header), &le_header, 0);
408      if (ret < 0) {
409          error_setg(errp, "Failed to read QED header");
410          return ret;
411      }
412      qed_header_le_to_cpu(&le_header, &s->header);
413  
414      if (s->header.magic != QED_MAGIC) {
415          error_setg(errp, "Image not in QED format");
416          return -EINVAL;
417      }
418      if (s->header.features & ~QED_FEATURE_MASK) {
419          /* image uses unsupported feature bits */
420          error_setg(errp, "Unsupported QED features: %" PRIx64,
421                     s->header.features & ~QED_FEATURE_MASK);
422          return -ENOTSUP;
423      }
424      if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
425          error_setg(errp, "QED cluster size is invalid");
426          return -EINVAL;
427      }
428  
429      /* Round down file size to the last cluster */
430      file_size = bdrv_co_getlength(bs->file->bs);
431      if (file_size < 0) {
432          error_setg(errp, "Failed to get file length");
433          return file_size;
434      }
435      s->file_size = qed_start_of_cluster(s, file_size);
436  
437      if (!qed_is_table_size_valid(s->header.table_size)) {
438          error_setg(errp, "QED table size is invalid");
439          return -EINVAL;
440      }
441      if (!qed_is_image_size_valid(s->header.image_size,
442                                   s->header.cluster_size,
443                                   s->header.table_size)) {
444          error_setg(errp, "QED image size is invalid");
445          return -EINVAL;
446      }
447      if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
448          error_setg(errp, "QED table offset is invalid");
449          return -EINVAL;
450      }
451  
452      s->table_nelems = (s->header.cluster_size * s->header.table_size) /
453                        sizeof(uint64_t);
454      s->l2_shift = ctz32(s->header.cluster_size);
455      s->l2_mask = s->table_nelems - 1;
456      s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
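    /*
     * With these shifts a virtual byte position decomposes as:
     *   L1 index        = pos >> l1_shift
     *   L2 index        = (pos >> l2_shift) & l2_mask
     *   byte in cluster = pos & (cluster_size - 1)
     * which is what the qed_l1_index()/qed_l2_index() helpers compute.
     */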
457  
458      /* Header size calculation must not overflow uint32_t */
459      if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
460          error_setg(errp, "QED header size is too large");
461          return -EINVAL;
462      }
463  
464      if ((s->header.features & QED_F_BACKING_FILE)) {
465          g_autofree char *backing_file_str = NULL;
466  
467          if ((uint64_t)s->header.backing_filename_offset +
468              s->header.backing_filename_size >
469              s->header.cluster_size * s->header.header_size) {
470              error_setg(errp, "QED backing filename offset is invalid");
471              return -EINVAL;
472          }
473  
474          backing_file_str = g_malloc(sizeof(bs->backing_file));
475          ret = qed_read_string(bs->file, s->header.backing_filename_offset,
476                                s->header.backing_filename_size,
477                                backing_file_str, sizeof(bs->backing_file));
478          if (ret < 0) {
479              error_setg(errp, "Failed to read backing filename");
480              return ret;
481          }
482  
483          if (!g_str_equal(backing_file_str, bs->backing_file)) {
484              pstrcpy(bs->backing_file, sizeof(bs->backing_file),
485                      backing_file_str);
486              pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
487                      backing_file_str);
488          }
489  
490          if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
491              pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
492          }
493      }
494  
495      /* Reset unknown autoclear feature bits.  This is a backwards
496       * compatibility mechanism: a program that does not recognize an
497       * autoclear feature bit "knocks it out", so that when the image is
498       * later opened by a program that does recognize the bit, it can
499       * detect that the feature's data is no longer valid.
500       */
501      if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
502          !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
503          s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
504  
505          ret = qed_write_header_sync(s);
506          if (ret) {
507              error_setg(errp, "Failed to update header");
508              return ret;
509          }
510  
511          /* From here on only known autoclear feature bits are valid */
512          bdrv_co_flush(bs->file->bs);
513      }
514  
515      s->l1_table = qed_alloc_table(s);
516      qed_init_l2_cache(&s->l2_cache);
517  
518      ret = qed_read_l1_table_sync(s);
519      if (ret) {
520          error_setg(errp, "Failed to read L1 table");
521          goto out;
522      }
523  
524      /* If image was not closed cleanly, check consistency */
525      if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
526          /* Read-only images cannot be fixed.  There is no risk of corruption
527           * since write operations are not possible.  Therefore, allow
528           * potentially inconsistent images to be opened read-only.  This can
529           * aid data recovery from an otherwise inconsistent image.
530           */
531          if (!bdrv_is_read_only(bs->file->bs) &&
532              !(flags & BDRV_O_INACTIVE)) {
533              BdrvCheckResult result = {0};
534  
535              ret = qed_check(s, &result, true);
536              if (ret) {
537                  error_setg(errp, "Image corrupted");
538                  goto out;
539              }
540          }
541      }
542  
543      bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
544  
545  out:
546      if (ret) {
547          qed_free_l2_cache(&s->l2_cache);
548          qemu_vfree(s->l1_table);
549      }
550      return ret;
551  }
552  
553  typedef struct QEDOpenCo {
554      BlockDriverState *bs;
555      QDict *options;
556      int flags;
557      Error **errp;
558      int ret;
559  } QEDOpenCo;
560  
561  static void coroutine_fn bdrv_qed_open_entry(void *opaque)
562  {
563      QEDOpenCo *qoc = opaque;
564      BDRVQEDState *s = qoc->bs->opaque;
565  
566      GRAPH_RDLOCK_GUARD();
567  
568      qemu_co_mutex_lock(&s->table_lock);
569      qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
570      qemu_co_mutex_unlock(&s->table_lock);
571  }
572  
573  static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
574                           Error **errp)
575  {
576      QEDOpenCo qoc = {
577          .bs = bs,
578          .options = options,
579          .flags = flags,
580          .errp = errp,
581          .ret = -EINPROGRESS
582      };
583      int ret;
584  
585      ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
586      if (ret < 0) {
587          return ret;
588      }
589  
590      bdrv_qed_init_state(bs);
591      assert(!qemu_in_coroutine());
592      assert(qemu_get_current_aio_context() == qemu_get_aio_context());
593      qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
594      BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
595  
596      return qoc.ret;
597  }
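/*
 * bdrv_qed_do_open() uses coroutine-only I/O (bdrv_co_pread()) and must take
 * table_lock, so the synchronous open path spawns a coroutine and polls until
 * it reports something other than the -EINPROGRESS sentinel set above.
 */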
598  
599  static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
600  {
601      BDRVQEDState *s = bs->opaque;
602  
603      bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
604      bs->bl.max_pwrite_zeroes = QEMU_ALIGN_DOWN(INT_MAX, s->header.cluster_size);
605  }
606  
607  /* We have nothing to do for QED reopen; the stub just returns
608   * success. */
609  static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
610                                     BlockReopenQueue *queue, Error **errp)
611  {
612      return 0;
613  }
614  
615  static void bdrv_qed_close(BlockDriverState *bs)
616  {
617      BDRVQEDState *s = bs->opaque;
618  
619      bdrv_qed_detach_aio_context(bs);
620  
621      /* Ensure writes reach stable storage */
622      bdrv_flush(bs->file->bs);
623  
624      /* Clean shutdown, no check required on next open */
625      if (s->header.features & QED_F_NEED_CHECK) {
626          s->header.features &= ~QED_F_NEED_CHECK;
627          qed_write_header_sync(s);
628      }
629  
630      qed_free_l2_cache(&s->l2_cache);
631      qemu_vfree(s->l1_table);
632  }
633  
634  static int coroutine_fn GRAPH_UNLOCKED
635  bdrv_qed_co_create(BlockdevCreateOptions *opts, Error **errp)
636  {
637      BlockdevCreateOptionsQed *qed_opts;
638      BlockBackend *blk = NULL;
639      BlockDriverState *bs = NULL;
640  
641      QEDHeader header;
642      QEDHeader le_header;
643      uint8_t *l1_table = NULL;
644      size_t l1_size;
645      int ret = 0;
646  
647      assert(opts->driver == BLOCKDEV_DRIVER_QED);
648      qed_opts = &opts->u.qed;
649  
650      /* Validate options and set default values */
651      if (!qed_opts->has_cluster_size) {
652          qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
653      }
654      if (!qed_opts->has_table_size) {
655          qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
656      }
657  
658      if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
659          error_setg(errp, "QED cluster size must be within range [%u, %u] "
660                           "and power of 2",
661                     QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
662          return -EINVAL;
663      }
664      if (!qed_is_table_size_valid(qed_opts->table_size)) {
665          error_setg(errp, "QED table size must be within range [%u, %u] "
666                           "and power of 2",
667                     QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
668          return -EINVAL;
669      }
670      if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
671                                   qed_opts->table_size))
672      {
673          error_setg(errp, "QED image size must be a non-zero multiple of "
674                           "cluster size and less than %" PRIu64 " bytes",
675                     qed_max_image_size(qed_opts->cluster_size,
676                                        qed_opts->table_size));
677          return -EINVAL;
678      }
679  
680      /* Create BlockBackend to write to the image */
681      bs = bdrv_co_open_blockdev_ref(qed_opts->file, errp);
682      if (bs == NULL) {
683          return -EIO;
684      }
685  
686      blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
687                               errp);
688      if (!blk) {
689          ret = -EPERM;
690          goto out;
691      }
692      blk_set_allow_write_beyond_eof(blk, true);
693  
694      /* Prepare image format */
695      header = (QEDHeader) {
696          .magic = QED_MAGIC,
697          .cluster_size = qed_opts->cluster_size,
698          .table_size = qed_opts->table_size,
699          .header_size = 1,
700          .features = 0,
701          .compat_features = 0,
702          .l1_table_offset = qed_opts->cluster_size,
703          .image_size = qed_opts->size,
704      };
705  
706      l1_size = header.cluster_size * header.table_size;
707  
708      /*
709       * The QED format associates file length with allocation status,
710       * so a new file (which is empty) must have a length of 0.
711       */
712      ret = blk_co_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp);
713      if (ret < 0) {
714          goto out;
715      }
716  
717      if (qed_opts->backing_file) {
718          header.features |= QED_F_BACKING_FILE;
719          header.backing_filename_offset = sizeof(le_header);
720          header.backing_filename_size = strlen(qed_opts->backing_file);
721  
722          if (qed_opts->has_backing_fmt) {
723              const char *backing_fmt = BlockdevDriver_str(qed_opts->backing_fmt);
724              if (qed_fmt_is_raw(backing_fmt)) {
725                  header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
726              }
727          }
728      }
729  
730      qed_header_cpu_to_le(&header, &le_header);
731      ret = blk_co_pwrite(blk, 0, sizeof(le_header), &le_header, 0);
732      if (ret < 0) {
733          goto out;
734      }
735      ret = blk_co_pwrite(blk, sizeof(le_header), header.backing_filename_size,
736                       qed_opts->backing_file, 0);
737      if (ret < 0) {
738          goto out;
739      }
740  
741      l1_table = g_malloc0(l1_size);
742      ret = blk_co_pwrite(blk, header.l1_table_offset, l1_size, l1_table, 0);
743      if (ret < 0) {
744          goto out;
745      }
746  
747      ret = 0; /* success */
748  out:
749      g_free(l1_table);
750      blk_co_unref(blk);
751      bdrv_co_unref(bs);
752      return ret;
753  }
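/*
 * Layout of a freshly created image (as written above): cluster 0 holds the
 * header, immediately followed by the backing filename string when one was
 * given (backing_filename_offset == sizeof(le_header)); the zeroed L1 table
 * starts at l1_table_offset == cluster_size and spans table_size clusters.
 * L2 tables and data clusters are appended later by allocating writes.
 */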
754  
755  static int coroutine_fn GRAPH_UNLOCKED
756  bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,
757                          QemuOpts *opts, Error **errp)
758  {
759      BlockdevCreateOptions *create_options = NULL;
760      QDict *qdict;
761      Visitor *v;
762      BlockDriverState *bs = NULL;
763      int ret;
764  
765      static const QDictRenames opt_renames[] = {
766          { BLOCK_OPT_BACKING_FILE,       "backing-file" },
767          { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
768          { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
769          { BLOCK_OPT_TABLE_SIZE,         "table-size" },
770          { NULL, NULL },
771      };
772  
773      /* Parse options and convert legacy syntax */
774      qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);
775  
776      if (!qdict_rename_keys(qdict, opt_renames, errp)) {
777          ret = -EINVAL;
778          goto fail;
779      }
780  
781      /* Create and open the file (protocol layer) */
782      ret = bdrv_co_create_file(filename, opts, errp);
783      if (ret < 0) {
784          goto fail;
785      }
786  
787      bs = bdrv_co_open(filename, NULL, NULL,
788                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
789      if (bs == NULL) {
790          ret = -EIO;
791          goto fail;
792      }
793  
794      /* Now get the QAPI type BlockdevCreateOptions */
795      qdict_put_str(qdict, "driver", "qed");
796      qdict_put_str(qdict, "file", bs->node_name);
797  
798      v = qobject_input_visitor_new_flat_confused(qdict, errp);
799      if (!v) {
800          ret = -EINVAL;
801          goto fail;
802      }
803  
804      visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
805      visit_free(v);
806      if (!create_options) {
807          ret = -EINVAL;
808          goto fail;
809      }
810  
811      /* Silently round up size */
812      assert(create_options->driver == BLOCKDEV_DRIVER_QED);
813      create_options->u.qed.size =
814          ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);
815  
816      /* Create the qed image (format layer) */
817      ret = bdrv_qed_co_create(create_options, errp);
818  
819  fail:
820      qobject_unref(qdict);
821      bdrv_co_unref(bs);
822      qapi_free_BlockdevCreateOptions(create_options);
823      return ret;
824  }
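/*
 * Example (illustrative): the legacy option names accepted here are mapped
 * onto the QAPI member names via opt_renames, so a command line such as
 *
 *   qemu-img create -f qed -o cluster_size=65536,backing_file=base.img img.qed 8G
 *
 * ends up as a BlockdevCreateOptionsQed with cluster-size and backing-file set.
 */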
825  
826  static int coroutine_fn GRAPH_RDLOCK
827  bdrv_qed_co_block_status(BlockDriverState *bs, bool want_zero, int64_t pos,
828                           int64_t bytes, int64_t *pnum, int64_t *map,
829                           BlockDriverState **file)
830  {
831      BDRVQEDState *s = bs->opaque;
832      size_t len = MIN(bytes, SIZE_MAX);
833      int status;
834      QEDRequest request = { .l2_table = NULL };
835      uint64_t offset;
836      int ret;
837  
838      qemu_co_mutex_lock(&s->table_lock);
839      ret = qed_find_cluster(s, &request, pos, &len, &offset);
840  
841      *pnum = len;
842      switch (ret) {
843      case QED_CLUSTER_FOUND:
844          *map = offset | qed_offset_into_cluster(s, pos);
845          status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
846          *file = bs->file->bs;
847          break;
848      case QED_CLUSTER_ZERO:
849          status = BDRV_BLOCK_ZERO;
850          break;
851      case QED_CLUSTER_L2:
852      case QED_CLUSTER_L1:
853          status = 0;
854          break;
855      default:
856          assert(ret < 0);
857          status = ret;
858          break;
859      }
860  
861      qed_unref_l2_cache_entry(request.l2_table);
862      qemu_co_mutex_unlock(&s->table_lock);
863  
864      return status;
865  }
866  
867  static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
868  {
869      return acb->bs->opaque;
870  }
871  
872  /**
873   * Read from the backing file or zero-fill if no backing file
874   *
875   * @s:              QED state
876   * @pos:            Byte position in device
877   * @qiov:           Destination I/O vector
878   *
879   * This function reads qiov->size bytes starting at pos from the backing file.
880   * If there is no backing file then zeroes are read.
881   */
882  static int coroutine_fn GRAPH_RDLOCK
883  qed_read_backing_file(BDRVQEDState *s, uint64_t pos, QEMUIOVector *qiov)
884  {
885      if (s->bs->backing) {
886          BLKDBG_CO_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
887          return bdrv_co_preadv(s->bs->backing, pos, qiov->size, qiov, 0);
888      }
889      qemu_iovec_memset(qiov, 0, 0, qiov->size);
890      return 0;
891  }
892  
893  /**
894   * Copy data from backing file into the image
895   *
896   * @s:          QED state
897   * @pos:        Byte position in device
898   * @len:        Number of bytes
899   * @offset:     Byte offset in image file
900   */
901  static int coroutine_fn GRAPH_RDLOCK
902  qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, uint64_t len,
903                             uint64_t offset)
904  {
905      QEMUIOVector qiov;
906      int ret;
907  
908      /* Skip copy entirely if there is no work to do */
909      if (len == 0) {
910          return 0;
911      }
912  
913      qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
914  
915      ret = qed_read_backing_file(s, pos, &qiov);
916  
917      if (ret) {
918          goto out;
919      }
920  
921      BLKDBG_CO_EVENT(s->bs->file, BLKDBG_COW_WRITE);
922      ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
923      if (ret < 0) {
924          goto out;
925      }
926      ret = 0;
927  out:
928      qemu_vfree(qemu_iovec_buf(&qiov));
929      return ret;
930  }
931  
932  /**
933   * Link one or more contiguous clusters into a table
934   *
935   * @s:              QED state
936   * @table:          L2 table
937   * @index:          First cluster index
938   * @n:              Number of contiguous clusters
939   * @cluster:        First cluster offset
940   *
941   * The cluster offset may be an allocated byte offset in the image file, the
942   * zero cluster marker, or the unallocated cluster marker.
943   *
944   * Called with table_lock held.
945   */
946  static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
947                                               int index, unsigned int n,
948                                               uint64_t cluster)
949  {
950      int i;
951      for (i = index; i < index + n; i++) {
952          table->offsets[i] = cluster;
953          if (!qed_offset_is_unalloc_cluster(cluster) &&
954              !qed_offset_is_zero_cluster(cluster)) {
955              cluster += s->header.cluster_size;
956          }
957      }
958  }
959  
960  /* Called with table_lock held.  */
961  static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
962  {
963      BDRVQEDState *s = acb_to_s(acb);
964  
965      /* Free resources */
966      qemu_iovec_destroy(&acb->cur_qiov);
967      qed_unref_l2_cache_entry(acb->request.l2_table);
968  
969      /* Free the buffer we may have allocated for zero writes */
970      if (acb->flags & QED_AIOCB_ZERO) {
971          qemu_vfree(acb->qiov->iov[0].iov_base);
972          acb->qiov->iov[0].iov_base = NULL;
973      }
974  
975      /* Start next allocating write request waiting behind this one.  Note that
976       * requests enqueue themselves when they first hit an unallocated cluster
977       * but they wait until the entire request is finished before waking up the
978       * next request in the queue.  This ensures that we don't cycle through
979       * requests multiple times but rather finish one at a time completely.
980       */
981      if (acb == s->allocating_acb) {
982          s->allocating_acb = NULL;
983          if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
984              qemu_co_queue_next(&s->allocating_write_reqs);
985          } else if (s->header.features & QED_F_NEED_CHECK) {
986              qed_start_need_check_timer(s);
987          }
988      }
989  }
990  
991  /**
992   * Update L1 table with new L2 table offset and write it out
993   *
994   * Called with table_lock held.
995   */
996  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_l1_update(QEDAIOCB *acb)
997  {
998      BDRVQEDState *s = acb_to_s(acb);
999      CachedL2Table *l2_table = acb->request.l2_table;
1000      uint64_t l2_offset = l2_table->offset;
1001      int index, ret;
1002  
1003      index = qed_l1_index(s, acb->cur_pos);
1004      s->l1_table->offsets[index] = l2_table->offset;
1005  
1006      ret = qed_write_l1_table(s, index, 1);
1007  
1008      /* Commit the current L2 table to the cache */
1009      qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
1010  
1011      /* This is guaranteed to succeed because we just committed the entry to the
1012       * cache.
1013       */
1014      acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
1015      assert(acb->request.l2_table != NULL);
1016  
1017      return ret;
1018  }
1019  
1020  
1021  /**
1022   * Update L2 table with new cluster offsets and write them out
1023   *
1024   * Called with table_lock held.
1025   */
1026  static int coroutine_fn GRAPH_RDLOCK
1027  qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
1028  {
1029      BDRVQEDState *s = acb_to_s(acb);
1030      bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1031      int index, ret;
1032  
1033      if (need_alloc) {
1034          qed_unref_l2_cache_entry(acb->request.l2_table);
1035          acb->request.l2_table = qed_new_l2_table(s);
1036      }
1037  
1038      index = qed_l2_index(s, acb->cur_pos);
1039      qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1040                           offset);
1041  
1042      if (need_alloc) {
1043          /* Write out the whole new L2 table */
1044          ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
1045          if (ret) {
1046              return ret;
1047          }
1048          return qed_aio_write_l1_update(acb);
1049      } else {
1050          /* Write out only the updated part of the L2 table */
1051          ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
1052                                   false);
1053          if (ret) {
1054              return ret;
1055          }
1056      }
1057      return 0;
1058  }
1059  
1060  /**
1061   * Write data to the image file
1062   *
1063   * Called with table_lock *not* held.
1064   */
1065  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_main(QEDAIOCB *acb)
1066  {
1067      BDRVQEDState *s = acb_to_s(acb);
1068      uint64_t offset = acb->cur_cluster +
1069                        qed_offset_into_cluster(s, acb->cur_pos);
1070  
1071      trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
1072  
1073      BLKDBG_CO_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1074      return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
1075                             &acb->cur_qiov, 0);
1076  }
1077  
1078  /**
1079   * Populate untouched regions of new data cluster
1080   *
1081   * Called with table_lock held.
1082   */
1083  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_cow(QEDAIOCB *acb)
1084  {
1085      BDRVQEDState *s = acb_to_s(acb);
1086      uint64_t start, len, offset;
1087      int ret;
1088  
1089      qemu_co_mutex_unlock(&s->table_lock);
1090  
1091      /* Populate front untouched region of new data cluster */
1092      start = qed_start_of_cluster(s, acb->cur_pos);
1093      len = qed_offset_into_cluster(s, acb->cur_pos);
1094  
1095      trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1096      ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
1097      if (ret < 0) {
1098          goto out;
1099      }
1100  
1101      /* Populate back untouched region of new data cluster */
1102      start = acb->cur_pos + acb->cur_qiov.size;
1103      len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
1104      offset = acb->cur_cluster +
1105               qed_offset_into_cluster(s, acb->cur_pos) +
1106               acb->cur_qiov.size;
1107  
1108      trace_qed_aio_write_postfill(s, acb, start, len, offset);
1109      ret = qed_copy_from_backing_file(s, start, len, offset);
1110      if (ret < 0) {
1111          goto out;
1112      }
1113  
1114      ret = qed_aio_write_main(acb);
1115      if (ret < 0) {
1116          goto out;
1117      }
1118  
1119      if (s->bs->backing) {
1120          /*
1121           * Flush new data clusters before updating the L2 table
1122           *
1123           * This flush is necessary when a backing file is in use.  A crash
1124           * during an allocating write could result in empty clusters in the
1125           * image.  If the write only touched a subregion of the cluster,
1126           * then backing image sectors have been lost in the untouched
1127           * region.  The solution is to flush after writing a new data
1128           * cluster and before updating the L2 table.
1129           */
1130          ret = bdrv_co_flush(s->bs->file->bs);
1131      }
1132  
1133  out:
1134      qemu_co_mutex_lock(&s->table_lock);
1135      return ret;
1136  }
1137  
1138  /**
1139   * Check if the QED_F_NEED_CHECK bit should be set during allocating write
1140   */
1141  static bool qed_should_set_need_check(BDRVQEDState *s)
1142  {
1143      /* The flush before L2 update path ensures consistency */
1144      if (s->bs->backing) {
1145          return false;
1146      }
1147  
1148      return !(s->header.features & QED_F_NEED_CHECK);
1149  }
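/*
 * In other words there are two crash-consistency strategies: with a backing
 * file, qed_aio_write_cow() flushes the new data cluster before the L2
 * update, so no dirty marker is needed; without one, the image is simply
 * marked QED_F_NEED_CHECK and any inconsistency left by a crash is repaired
 * by qed_check() on the next read-write open.
 */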
1150  
1151  /**
1152   * Write new data cluster
1153   *
1154   * @acb:        Write request
1155   * @len:        Length in bytes
1156   *
1157   * This path is taken when writing to previously unallocated clusters.
1158   *
1159   * Called with table_lock held.
1160   */
1161  static int coroutine_fn GRAPH_RDLOCK
1162  qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1163  {
1164      BDRVQEDState *s = acb_to_s(acb);
1165      int ret;
1166  
1167      /* Cancel timer when the first allocating request comes in */
1168      if (s->allocating_acb == NULL) {
1169          qed_cancel_need_check_timer(s);
1170      }
1171  
1172      /* Freeze this request if another allocating write is in progress */
1173      if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
1174          if (s->allocating_acb != NULL) {
1175              qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
1176              assert(s->allocating_acb == NULL);
1177          }
1178          s->allocating_acb = acb;
1179          return -EAGAIN; /* start over with looking up table entries */
1180      }
1181  
1182      acb->cur_nclusters = qed_bytes_to_clusters(s,
1183              qed_offset_into_cluster(s, acb->cur_pos) + len);
1184      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1185  
1186      if (acb->flags & QED_AIOCB_ZERO) {
1187          /* Skip ahead if the clusters are already zero */
1188          if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1189              return 0;
1190          }
1191          acb->cur_cluster = 1;
1192      } else {
1193          acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
1194      }
1195  
1196      if (qed_should_set_need_check(s)) {
1197          s->header.features |= QED_F_NEED_CHECK;
1198          ret = qed_write_header(s);
1199          if (ret < 0) {
1200              return ret;
1201          }
1202      }
1203  
1204      if (!(acb->flags & QED_AIOCB_ZERO)) {
1205          ret = qed_aio_write_cow(acb);
1206          if (ret < 0) {
1207              return ret;
1208          }
1209      }
1210  
1211      return qed_aio_write_l2_update(acb, acb->cur_cluster);
1212  }
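/*
 * Note on the magic value above: cur_cluster = 1 is the QED zero-cluster
 * marker (see qed_offset_is_zero_cluster()).  It is not a file offset;
 * qed_aio_write_l2_update() stores it directly in the L2 table so reads of
 * the range return zeroes without any cluster being allocated.
 */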
1213  
1214  /**
1215   * Write data cluster in place
1216   *
1217   * @acb:        Write request
1218   * @offset:     Cluster offset in bytes
1219   * @len:        Length in bytes
1220   *
1221   * This path is taken when writing to already allocated clusters.
1222   *
1223   * Called with table_lock held.
1224   */
1225  static int coroutine_fn GRAPH_RDLOCK
1226  qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
1227  {
1228      BDRVQEDState *s = acb_to_s(acb);
1229      int r;
1230  
1231      qemu_co_mutex_unlock(&s->table_lock);
1232  
1233      /* Allocate buffer for zero writes */
1234      if (acb->flags & QED_AIOCB_ZERO) {
1235          struct iovec *iov = acb->qiov->iov;
1236  
1237          if (!iov->iov_base) {
1238              iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
1239              if (iov->iov_base == NULL) {
1240                  r = -ENOMEM;
1241                  goto out;
1242              }
1243              memset(iov->iov_base, 0, iov->iov_len);
1244          }
1245      }
1246  
1247      /* Calculate the I/O vector */
1248      acb->cur_cluster = offset;
1249      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1250  
1251      /* Do the actual write.  */
1252      r = qed_aio_write_main(acb);
1253  out:
1254      qemu_co_mutex_lock(&s->table_lock);
1255      return r;
1256  }
1257  
1258  /**
1259   * Write data cluster
1260   *
1261   * @opaque:     Write request
1262   * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1263   * @offset:     Cluster offset in bytes
1264   * @len:        Length in bytes
1265   *
1266   * Called with table_lock held.
1267   */
1268  static int coroutine_fn GRAPH_RDLOCK
1269  qed_aio_write_data(void *opaque, int ret, uint64_t offset, size_t len)
1270  {
1271      QEDAIOCB *acb = opaque;
1272  
1273      trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1274  
1275      acb->find_cluster_ret = ret;
1276  
1277      switch (ret) {
1278      case QED_CLUSTER_FOUND:
1279          return qed_aio_write_inplace(acb, offset, len);
1280  
1281      case QED_CLUSTER_L2:
1282      case QED_CLUSTER_L1:
1283      case QED_CLUSTER_ZERO:
1284          return qed_aio_write_alloc(acb, len);
1285  
1286      default:
1287          g_assert_not_reached();
1288      }
1289  }
1290  
1291  /**
1292   * Read data cluster
1293   *
1294   * @opaque:     Read request
1295   * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1296   * @offset:     Cluster offset in bytes
1297   * @len:        Length in bytes
1298   *
1299   * Called with table_lock held.
1300   */
1301  static int coroutine_fn GRAPH_RDLOCK
1302  qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
1303  {
1304      QEDAIOCB *acb = opaque;
1305      BDRVQEDState *s = acb_to_s(acb);
1306      BlockDriverState *bs = acb->bs;
1307      int r;
1308  
1309      qemu_co_mutex_unlock(&s->table_lock);
1310  
1311      /* Adjust offset into cluster */
1312      offset += qed_offset_into_cluster(s, acb->cur_pos);
1313  
1314      trace_qed_aio_read_data(s, acb, ret, offset, len);
1315  
1316      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1317  
1318      /* Handle zero cluster and backing file reads, otherwise read
1319       * data cluster directly.
1320       */
1321      if (ret == QED_CLUSTER_ZERO) {
1322          qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
1323          r = 0;
1324      } else if (ret != QED_CLUSTER_FOUND) {
1325          r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov);
1326      } else {
1327          BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
1328          r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
1329                             &acb->cur_qiov, 0);
1330      }
1331  
1332      qemu_co_mutex_lock(&s->table_lock);
1333      return r;
1334  }
1335  
1336  /**
1337   * Begin next I/O or complete the request
1338   */
1339  static int coroutine_fn GRAPH_RDLOCK qed_aio_next_io(QEDAIOCB *acb)
1340  {
1341      BDRVQEDState *s = acb_to_s(acb);
1342      uint64_t offset;
1343      size_t len;
1344      int ret;
1345  
1346      qemu_co_mutex_lock(&s->table_lock);
1347      while (1) {
1348          trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
1349  
1350          acb->qiov_offset += acb->cur_qiov.size;
1351          acb->cur_pos += acb->cur_qiov.size;
1352          qemu_iovec_reset(&acb->cur_qiov);
1353  
1354          /* Complete request */
1355          if (acb->cur_pos >= acb->end_pos) {
1356              ret = 0;
1357              break;
1358          }
1359  
1360          /* Find next cluster and start I/O */
1361          len = acb->end_pos - acb->cur_pos;
1362          ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
1363          if (ret < 0) {
1364              break;
1365          }
1366  
1367          if (acb->flags & QED_AIOCB_WRITE) {
1368              ret = qed_aio_write_data(acb, ret, offset, len);
1369          } else {
1370              ret = qed_aio_read_data(acb, ret, offset, len);
1371          }
1372  
1373          if (ret < 0 && ret != -EAGAIN) {
1374              break;
1375          }
1376      }
1377  
1378      trace_qed_aio_complete(s, acb, ret);
1379      qed_aio_complete(acb);
1380      qemu_co_mutex_unlock(&s->table_lock);
1381      return ret;
1382  }
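/*
 * -EAGAIN handling above is not an error path: qed_aio_write_alloc() returns
 * it once this request has claimed the single allocating-write slot, possibly
 * after waiting for a competing request.  cur_qiov is still empty at that
 * point, so the next loop iteration retries the cluster lookup at the same
 * cur_pos and picks up any table changes made while waiting.
 */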
1383  
1384  static int coroutine_fn GRAPH_RDLOCK
1385  qed_co_request(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov,
1386                 int nb_sectors, int flags)
1387  {
1388      QEDAIOCB acb = {
1389          .bs         = bs,
1390          .cur_pos    = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
1391          .end_pos    = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
1392          .qiov       = qiov,
1393          .flags      = flags,
1394      };
1395      qemu_iovec_init(&acb.cur_qiov, qiov->niov);
1396  
1397      trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
1398  
1399      /* Start request */
1400      return qed_aio_next_io(&acb);
1401  }
1402  
1403  static int coroutine_fn GRAPH_RDLOCK
1404  bdrv_qed_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1405                    QEMUIOVector *qiov)
1406  {
1407      return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
1408  }
1409  
1410  static int coroutine_fn GRAPH_RDLOCK
1411  bdrv_qed_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1412                     QEMUIOVector *qiov, int flags)
1413  {
1414      return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
1415  }
1416  
1417  static int coroutine_fn GRAPH_RDLOCK
1418  bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
1419                            BdrvRequestFlags flags)
1420  {
1421      BDRVQEDState *s = bs->opaque;
1422  
1423      /*
1424       * Zero writes start without an I/O buffer.  If a buffer becomes necessary
1425       * then it will be allocated during request processing.
1426       */
1427      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
1428  
1429      /*
1430       * QED is not prepared for 63-bit write-zero requests, so rely on
1431       * max_pwrite_zeroes.
1432       */
1433      assert(bytes <= INT_MAX);
1434  
1435      /* Fall back if the request is not aligned */
1436      if (qed_offset_into_cluster(s, offset) ||
1437          qed_offset_into_cluster(s, bytes)) {
1438          return -ENOTSUP;
1439      }
1440  
1441      return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
1442                            bytes >> BDRV_SECTOR_BITS,
1443                            QED_AIOCB_WRITE | QED_AIOCB_ZERO);
1444  }
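/*
 * Returning -ENOTSUP for unaligned requests is deliberate: the generic block
 * layer then falls back to writing an explicit buffer of zeroes, so callers
 * still see the request succeed, just without the zero-cluster optimization.
 */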
1445  
1446  static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
1447                                               int64_t offset,
1448                                               bool exact,
1449                                               PreallocMode prealloc,
1450                                               BdrvRequestFlags flags,
1451                                               Error **errp)
1452  {
1453      BDRVQEDState *s = bs->opaque;
1454      uint64_t old_image_size;
1455      int ret;
1456  
1457      if (prealloc != PREALLOC_MODE_OFF) {
1458          error_setg(errp, "Unsupported preallocation mode '%s'",
1459                     PreallocMode_str(prealloc));
1460          return -ENOTSUP;
1461      }
1462  
1463      if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1464                                   s->header.table_size)) {
1465          error_setg(errp, "Invalid image size specified");
1466          return -EINVAL;
1467      }
1468  
1469      if ((uint64_t)offset < s->header.image_size) {
1470          error_setg(errp, "Shrinking images is currently not supported");
1471          return -ENOTSUP;
1472      }
1473  
1474      old_image_size = s->header.image_size;
1475      s->header.image_size = offset;
1476      ret = qed_write_header_sync(s);
1477      if (ret < 0) {
1478          s->header.image_size = old_image_size;
1479          error_setg_errno(errp, -ret, "Failed to update the image size");
1480      }
1481      return ret;
1482  }
1483  
1484  static int64_t coroutine_fn bdrv_qed_co_getlength(BlockDriverState *bs)
1485  {
1486      BDRVQEDState *s = bs->opaque;
1487      return s->header.image_size;
1488  }
1489  
1490  static int coroutine_fn
1491  bdrv_qed_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1492  {
1493      BDRVQEDState *s = bs->opaque;
1494  
1495      memset(bdi, 0, sizeof(*bdi));
1496      bdi->cluster_size = s->header.cluster_size;
1497      bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1498      return 0;
1499  }
1500  
1501  static int bdrv_qed_change_backing_file(BlockDriverState *bs,
1502                                          const char *backing_file,
1503                                          const char *backing_fmt)
1504  {
1505      BDRVQEDState *s = bs->opaque;
1506      QEDHeader new_header, le_header;
1507      void *buffer;
1508      size_t buffer_len, backing_file_len;
1509      int ret;
1510  
1511      /* Refuse to set backing filename if unknown compat feature bits are
1512       * active.  If the image uses an unknown compat feature then we may not
1513       * know the layout of data following the header structure and cannot safely
1514       * add a new string.
1515       */
1516      if (backing_file && (s->header.compat_features &
1517                           ~QED_COMPAT_FEATURE_MASK)) {
1518          return -ENOTSUP;
1519      }
1520  
1521      memcpy(&new_header, &s->header, sizeof(new_header));
1522  
1523      new_header.features &= ~(QED_F_BACKING_FILE |
1524                               QED_F_BACKING_FORMAT_NO_PROBE);
1525  
1526      /* Adjust feature flags */
1527      if (backing_file) {
1528          new_header.features |= QED_F_BACKING_FILE;
1529  
1530          if (qed_fmt_is_raw(backing_fmt)) {
1531              new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
1532          }
1533      }
1534  
1535      /* Calculate new header size */
1536      backing_file_len = 0;
1537  
1538      if (backing_file) {
1539          backing_file_len = strlen(backing_file);
1540      }
1541  
1542      buffer_len = sizeof(new_header);
1543      new_header.backing_filename_offset = buffer_len;
1544      new_header.backing_filename_size = backing_file_len;
1545      buffer_len += backing_file_len;
1546  
1547      /* Make sure we can rewrite header without failing */
1548      if (buffer_len > new_header.header_size * new_header.cluster_size) {
1549          return -ENOSPC;
1550      }
1551  
1552      /* Prepare new header */
1553      buffer = g_malloc(buffer_len);
1554  
1555      qed_header_cpu_to_le(&new_header, &le_header);
1556      memcpy(buffer, &le_header, sizeof(le_header));
1557      buffer_len = sizeof(le_header);
1558  
1559      if (backing_file) {
1560          memcpy(buffer + buffer_len, backing_file, backing_file_len);
1561          buffer_len += backing_file_len;
1562      }
1563  
1564      /* Write new header */
1565      ret = bdrv_pwrite_sync(bs->file, 0, buffer_len, buffer, 0);
1566      g_free(buffer);
1567      if (ret == 0) {
1568          memcpy(&s->header, &new_header, sizeof(new_header));
1569      }
1570      return ret;
1571  }
1572  
1573  static void coroutine_fn GRAPH_RDLOCK
1574  bdrv_qed_co_invalidate_cache(BlockDriverState *bs, Error **errp)
1575  {
1576      BDRVQEDState *s = bs->opaque;
1577      int ret;
1578  
1579      bdrv_qed_close(bs);
1580  
1581      bdrv_qed_init_state(bs);
1582      qemu_co_mutex_lock(&s->table_lock);
1583      ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, errp);
1584      qemu_co_mutex_unlock(&s->table_lock);
1585      if (ret < 0) {
1586          error_prepend(errp, "Could not reopen qed layer: ");
1587      }
1588  }
1589  
1590  static int coroutine_fn GRAPH_RDLOCK
1591  bdrv_qed_co_check(BlockDriverState *bs, BdrvCheckResult *result,
1592                    BdrvCheckMode fix)
1593  {
1594      BDRVQEDState *s = bs->opaque;
1595      int ret;
1596  
1597      qemu_co_mutex_lock(&s->table_lock);
1598      ret = qed_check(s, result, !!fix);
1599      qemu_co_mutex_unlock(&s->table_lock);
1600  
1601      return ret;
1602  }
1603  
1604  static QemuOptsList qed_create_opts = {
1605      .name = "qed-create-opts",
1606      .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
1607      .desc = {
1608          {
1609              .name = BLOCK_OPT_SIZE,
1610              .type = QEMU_OPT_SIZE,
1611              .help = "Virtual disk size"
1612          },
1613          {
1614              .name = BLOCK_OPT_BACKING_FILE,
1615              .type = QEMU_OPT_STRING,
1616              .help = "File name of a base image"
1617          },
1618          {
1619              .name = BLOCK_OPT_BACKING_FMT,
1620              .type = QEMU_OPT_STRING,
1621              .help = "Image format of the base image"
1622          },
1623          {
1624              .name = BLOCK_OPT_CLUSTER_SIZE,
1625              .type = QEMU_OPT_SIZE,
1626              .help = "Cluster size (in bytes)",
1627              .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
1628          },
1629          {
1630              .name = BLOCK_OPT_TABLE_SIZE,
1631              .type = QEMU_OPT_SIZE,
1632              .help = "L1/L2 table size (in clusters)"
1633          },
1634          { /* end of list */ }
1635      }
1636  };
1637  
1638  static BlockDriver bdrv_qed = {
1639      .format_name              = "qed",
1640      .instance_size            = sizeof(BDRVQEDState),
1641      .create_opts              = &qed_create_opts,
1642      .is_format                = true,
1643      .supports_backing         = true,
1644  
1645      .bdrv_probe               = bdrv_qed_probe,
1646      .bdrv_open                = bdrv_qed_open,
1647      .bdrv_close               = bdrv_qed_close,
1648      .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
1649      .bdrv_child_perm          = bdrv_default_perms,
1650      .bdrv_co_create           = bdrv_qed_co_create,
1651      .bdrv_co_create_opts      = bdrv_qed_co_create_opts,
1652      .bdrv_has_zero_init       = bdrv_has_zero_init_1,
1653      .bdrv_co_block_status     = bdrv_qed_co_block_status,
1654      .bdrv_co_readv            = bdrv_qed_co_readv,
1655      .bdrv_co_writev           = bdrv_qed_co_writev,
1656      .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
1657      .bdrv_co_truncate         = bdrv_qed_co_truncate,
1658      .bdrv_co_getlength        = bdrv_qed_co_getlength,
1659      .bdrv_co_get_info         = bdrv_qed_co_get_info,
1660      .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
1661      .bdrv_change_backing_file = bdrv_qed_change_backing_file,
1662      .bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
1663      .bdrv_co_check            = bdrv_qed_co_check,
1664      .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
1665      .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
1666      .bdrv_drain_begin         = bdrv_qed_drain_begin,
1667  };
1668  
1669  static void bdrv_qed_init(void)
1670  {
1671      bdrv_register(&bdrv_qed);
1672  }
1673  
1674  block_init(bdrv_qed_init);
1675