1  /*
2   * QEMU Enhanced Disk Format
3   *
4   * Copyright IBM, Corp. 2010
5   *
6   * Authors:
7   *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
8   *  Anthony Liguori   <aliguori@us.ibm.com>
9   *
10   * This work is licensed under the terms of the GNU LGPL, version 2 or later.
11   * See the COPYING.LIB file in the top-level directory.
12   *
13   */
14  
15  #include "qemu/osdep.h"
16  #include "block/qdict.h"
17  #include "qapi/error.h"
18  #include "qemu/timer.h"
19  #include "qemu/bswap.h"
20  #include "qemu/main-loop.h"
21  #include "qemu/module.h"
22  #include "qemu/option.h"
23  #include "qemu/memalign.h"
24  #include "trace.h"
25  #include "qed.h"
26  #include "sysemu/block-backend.h"
27  #include "qapi/qmp/qdict.h"
28  #include "qapi/qobject-input-visitor.h"
29  #include "qapi/qapi-visit-block-core.h"
30  
31  static QemuOptsList qed_create_opts;
32  
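/**
 * Probe for the QED image format
 *
 * Returns 100 (a definite match) when the buffer is large enough to hold a
 * QED header and starts with the QED magic, 0 otherwise.
 */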
33  static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
34                            const char *filename)
35  {
36      const QEDHeader *header = (const QEDHeader *)buf;
37  
38      if (buf_size < sizeof(*header)) {
39          return 0;
40      }
41      if (le32_to_cpu(header->magic) != QED_MAGIC) {
42          return 0;
43      }
44      return 100;
45  }
46  
47  /**
48   * Check whether an image format is raw
49   *
50   * @fmt:    Backing file format, may be NULL
51   */
52  static bool qed_fmt_is_raw(const char *fmt)
53  {
54      return fmt && strcmp(fmt, "raw") == 0;
55  }
56  
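/*
 * All fields of the on-disk QED header are little-endian.  The helpers below
 * convert between the on-disk representation and host byte order, so the rest
 * of the driver can work with a native-endian copy in BDRVQEDState.
 */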
57  static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
58  {
59      cpu->magic = le32_to_cpu(le->magic);
60      cpu->cluster_size = le32_to_cpu(le->cluster_size);
61      cpu->table_size = le32_to_cpu(le->table_size);
62      cpu->header_size = le32_to_cpu(le->header_size);
63      cpu->features = le64_to_cpu(le->features);
64      cpu->compat_features = le64_to_cpu(le->compat_features);
65      cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
66      cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
67      cpu->image_size = le64_to_cpu(le->image_size);
68      cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
69      cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
70  }
71  
72  static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
73  {
74      le->magic = cpu_to_le32(cpu->magic);
75      le->cluster_size = cpu_to_le32(cpu->cluster_size);
76      le->table_size = cpu_to_le32(cpu->table_size);
77      le->header_size = cpu_to_le32(cpu->header_size);
78      le->features = cpu_to_le64(cpu->features);
79      le->compat_features = cpu_to_le64(cpu->compat_features);
80      le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
81      le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
82      le->image_size = cpu_to_le64(cpu->image_size);
83      le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
84      le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
85  }
86  
87  int qed_write_header_sync(BDRVQEDState *s)
88  {
89      QEDHeader le;
90  
91      qed_header_cpu_to_le(&s->header, &le);
92      return bdrv_pwrite(s->bs->file, 0, sizeof(le), &le, 0);
93  }
94  
95  /**
96   * Update header in-place (does not rewrite backing filename or other strings)
97   *
98   * This function only updates known header fields in-place and does not affect
99   * extra data after the QED header.
100   *
101   * No new allocating requests can start while this function runs.
102   */
103  static int coroutine_fn GRAPH_RDLOCK qed_write_header(BDRVQEDState *s)
104  {
105      /* We must write full sectors for O_DIRECT but cannot necessarily generate
106       * the data following the header if an unrecognized compat feature is
107       * active.  Therefore, first read the sectors containing the header, update
108       * them, and write back.
109       */
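    /* With sizeof(QEDHeader) == 64 and 512-byte sectors this amounts to
     * reading back and rewriting a single sector (nsectors == 1, len == 512).
     */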
110  
111      int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
112      size_t len = nsectors * BDRV_SECTOR_SIZE;
113      uint8_t *buf;
114      int ret;
115  
116      assert(s->allocating_acb || s->allocating_write_reqs_plugged);
117  
118      buf = qemu_blockalign(s->bs, len);
119  
120      ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
121      if (ret < 0) {
122          goto out;
123      }
124  
125      /* Update header */
126      qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
127  
128      ret = bdrv_co_pwrite(s->bs->file, 0, len, buf, 0);
129      if (ret < 0) {
130          goto out;
131      }
132  
133      ret = 0;
134  out:
135      qemu_vfree(buf);
136      return ret;
137  }
138  
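/**
 * Compute the maximum image size for a given cluster and table size
 *
 * For illustration, with the default 64 KiB cluster size and a table size of
 * 4 clusters, each table holds (4 * 65536) / 8 = 32768 entries, one L2 table
 * maps 32768 * 64 KiB = 2 GiB, and 32768 L1 entries give a 64 TiB maximum.
 */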
139  static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
140  {
141      uint64_t table_entries;
142      uint64_t l2_size;
143  
144      table_entries = (table_size * cluster_size) / sizeof(uint64_t);
145      l2_size = table_entries * cluster_size;
146  
147      return l2_size * table_entries;
148  }
149  
150  static bool qed_is_cluster_size_valid(uint32_t cluster_size)
151  {
152      if (cluster_size < QED_MIN_CLUSTER_SIZE ||
153          cluster_size > QED_MAX_CLUSTER_SIZE) {
154          return false;
155      }
156      if (cluster_size & (cluster_size - 1)) {
157          return false; /* not power of 2 */
158      }
159      return true;
160  }
161  
162  static bool qed_is_table_size_valid(uint32_t table_size)
163  {
164      if (table_size < QED_MIN_TABLE_SIZE ||
165          table_size > QED_MAX_TABLE_SIZE) {
166          return false;
167      }
168      if (table_size & (table_size - 1)) {
169          return false; /* not power of 2 */
170      }
171      return true;
172  }
173  
174  static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
175                                      uint32_t table_size)
176  {
177      if (image_size % BDRV_SECTOR_SIZE != 0) {
178          return false; /* not multiple of sector size */
179      }
180      if (image_size > qed_max_image_size(cluster_size, table_size)) {
181          return false; /* image is too large */
182      }
183      return true;
184  }
185  
186  /**
187   * Read a string of known length from the image file
188   *
189   * @file:       Image file
190   * @offset:     File offset to start of string, in bytes
191   * @n:          String length in bytes
192   * @buf:        Destination buffer
193   * @buflen:     Destination buffer length in bytes
194   * @ret:        0 on success, -errno on failure
195   *
196   * The string is NUL-terminated.
197   */
198  static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
199                             char *buf, size_t buflen)
200  {
201      int ret;
202      if (n >= buflen) {
203          return -EINVAL;
204      }
205      ret = bdrv_pread(file, offset, n, buf, 0);
206      if (ret < 0) {
207          return ret;
208      }
209      buf[n] = '\0';
210      return 0;
211  }
212  
213  /**
214   * Allocate new clusters
215   *
216   * @s:          QED state
217   * @n:          Number of contiguous clusters to allocate
218   * @ret:        Offset of first allocated cluster
219   *
220   * This function only produces the offset where the new clusters should be
221   * written.  It updates BDRVQEDState but does not make any changes to the image
222   * file.
223   *
224   * Called with table_lock held.
225   */
226  static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
227  {
228      uint64_t offset = s->file_size;
229      s->file_size += n * s->header.cluster_size;
230      return offset;
231  }
232  
233  QEDTable *qed_alloc_table(BDRVQEDState *s)
234  {
235      /* Honor O_DIRECT memory alignment requirements */
236      return qemu_blockalign(s->bs,
237                             s->header.cluster_size * s->header.table_size);
238  }
239  
240  /**
241   * Allocate a new zeroed L2 table
242   *
243   * Called with table_lock held.
244   */
245  static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
246  {
247      CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
248  
249      l2_table->table = qed_alloc_table(s);
250      l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
251  
252      memset(l2_table->table->offsets, 0,
253             s->header.cluster_size * s->header.table_size);
254      return l2_table;
255  }
256  
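/*
 * Plugging acts as a barrier for allocating write requests: once plugged,
 * allocating writes wait on the allocating_write_reqs queue instead of
 * proceeding, which lets the need-check timer rewrite the header while no
 * allocation is in flight.  qed_unplug_allocating_write_reqs() lifts the
 * barrier and wakes the next queued request.
 */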
257  static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s)
258  {
259      qemu_co_mutex_lock(&s->table_lock);
260  
261      /* No reentrancy is allowed.  */
262      assert(!s->allocating_write_reqs_plugged);
263      if (s->allocating_acb != NULL) {
264          /* Another allocating write came concurrently.  This cannot happen
265           * from bdrv_qed_drain_begin, but it can happen when the timer runs.
266           */
267          qemu_co_mutex_unlock(&s->table_lock);
268          return false;
269      }
270  
271      s->allocating_write_reqs_plugged = true;
272      qemu_co_mutex_unlock(&s->table_lock);
273      return true;
274  }
275  
276  static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s)
277  {
278      qemu_co_mutex_lock(&s->table_lock);
279      assert(s->allocating_write_reqs_plugged);
280      s->allocating_write_reqs_plugged = false;
281      qemu_co_queue_next(&s->allocating_write_reqs);
282      qemu_co_mutex_unlock(&s->table_lock);
283  }
284  
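/*
 * Lazily clear the QED_F_NEED_CHECK "dirty" flag: it is set by the first
 * allocating write and cleared here once the image has been idle for
 * QED_NEED_CHECK_TIMEOUT seconds, after flushing data and rewriting the
 * header.  If QEMU crashes before this runs, the flag stays set and the next
 * open triggers a consistency check.
 */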
285  static void coroutine_fn GRAPH_RDLOCK qed_need_check_timer(BDRVQEDState *s)
286  {
287      int ret;
288  
289      trace_qed_need_check_timer_cb(s);
290      assert_bdrv_graph_readable();
291  
292      if (!qed_plug_allocating_write_reqs(s)) {
293          return;
294      }
295  
296      /* Ensure writes are on disk before clearing flag */
297      ret = bdrv_co_flush(s->bs->file->bs);
298      if (ret < 0) {
299          qed_unplug_allocating_write_reqs(s);
300          return;
301      }
302  
303      s->header.features &= ~QED_F_NEED_CHECK;
304      ret = qed_write_header(s);
305      (void) ret;
306  
307      qed_unplug_allocating_write_reqs(s);
308  
309      ret = bdrv_co_flush(s->bs);
310      (void) ret;
311  }
312  
313  static void coroutine_fn qed_need_check_timer_entry(void *opaque)
314  {
315      BDRVQEDState *s = opaque;
316      GRAPH_RDLOCK_GUARD();
317  
318      qed_need_check_timer(opaque);
319      bdrv_dec_in_flight(s->bs);
320  }
321  
322  static void qed_need_check_timer_cb(void *opaque)
323  {
324      BDRVQEDState *s = opaque;
325      Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
326  
327      bdrv_inc_in_flight(s->bs);
328      qemu_coroutine_enter(co);
329  }
330  
331  static void qed_start_need_check_timer(BDRVQEDState *s)
332  {
333      trace_qed_start_need_check_timer(s);
334  
335      /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
336       * migration.
337       */
338      timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
339                     NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
340  }
341  
342  /* It's okay to call this multiple times or when no timer is started */
343  static void qed_cancel_need_check_timer(BDRVQEDState *s)
344  {
345      trace_qed_cancel_need_check_timer(s);
346      timer_del(s->need_check_timer);
347  }
348  
349  static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
350  {
351      BDRVQEDState *s = bs->opaque;
352  
353      qed_cancel_need_check_timer(s);
354      timer_free(s->need_check_timer);
355  }
356  
357  static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
358                                          AioContext *new_context)
359  {
360      BDRVQEDState *s = bs->opaque;
361  
362      s->need_check_timer = aio_timer_new(new_context,
363                                          QEMU_CLOCK_VIRTUAL, SCALE_NS,
364                                          qed_need_check_timer_cb, s);
365      if (s->header.features & QED_F_NEED_CHECK) {
366          qed_start_need_check_timer(s);
367      }
368  }
369  
370  static void bdrv_qed_drain_begin(BlockDriverState *bs)
371  {
372      BDRVQEDState *s = bs->opaque;
373  
374      /* Fire the timer immediately in order to start doing I/O as soon as the
375       * header is flushed.
376       */
377      if (s->need_check_timer && timer_pending(s->need_check_timer)) {
378          Coroutine *co;
379  
380          qed_cancel_need_check_timer(s);
381          co = qemu_coroutine_create(qed_need_check_timer_entry, s);
382          bdrv_inc_in_flight(bs);
383          aio_co_enter(bdrv_get_aio_context(bs), co);
384      }
385  }
386  
387  static void bdrv_qed_init_state(BlockDriverState *bs)
388  {
389      BDRVQEDState *s = bs->opaque;
390  
391      memset(s, 0, sizeof(BDRVQEDState));
392      s->bs = bs;
393      qemu_co_mutex_init(&s->table_lock);
394      qemu_co_queue_init(&s->allocating_write_reqs);
395  }
396  
397  /* Called with table_lock held.  */
398  static int coroutine_fn GRAPH_RDLOCK
399  bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
400  {
401      BDRVQEDState *s = bs->opaque;
402      QEDHeader le_header;
403      int64_t file_size;
404      int ret;
405  
406      ret = bdrv_co_pread(bs->file, 0, sizeof(le_header), &le_header, 0);
407      if (ret < 0) {
408          error_setg(errp, "Failed to read QED header");
409          return ret;
410      }
411      qed_header_le_to_cpu(&le_header, &s->header);
412  
413      if (s->header.magic != QED_MAGIC) {
414          error_setg(errp, "Image not in QED format");
415          return -EINVAL;
416      }
417      if (s->header.features & ~QED_FEATURE_MASK) {
418          /* image uses unsupported feature bits */
419          error_setg(errp, "Unsupported QED features: %" PRIx64,
420                     s->header.features & ~QED_FEATURE_MASK);
421          return -ENOTSUP;
422      }
423      if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
424          error_setg(errp, "QED cluster size is invalid");
425          return -EINVAL;
426      }
427  
428      /* Round down file size to the last cluster */
429      file_size = bdrv_co_getlength(bs->file->bs);
430      if (file_size < 0) {
431          error_setg(errp, "Failed to get file length");
432          return file_size;
433      }
434      s->file_size = qed_start_of_cluster(s, file_size);
435  
436      if (!qed_is_table_size_valid(s->header.table_size)) {
437          error_setg(errp, "QED table size is invalid");
438          return -EINVAL;
439      }
440      if (!qed_is_image_size_valid(s->header.image_size,
441                                   s->header.cluster_size,
442                                   s->header.table_size)) {
443          error_setg(errp, "QED image size is invalid");
444          return -EINVAL;
445      }
446      if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
447          error_setg(errp, "QED table offset is invalid");
448          return -EINVAL;
449      }
450  
451      s->table_nelems = (s->header.cluster_size * s->header.table_size) /
452                        sizeof(uint64_t);
453      s->l2_shift = ctz32(s->header.cluster_size);
454      s->l2_mask = s->table_nelems - 1;
455      s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
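    /* With the default 64 KiB clusters and table size of 4 this gives
     * table_nelems == 32768, l2_shift == 16, l2_mask == 0x7fff and
     * l1_shift == 31, i.e. each L1 entry covers 2 GiB of the virtual disk.
     */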
456  
457      /* Header size calculation must not overflow uint32_t */
458      if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
459          error_setg(errp, "QED header size is too large");
460          return -EINVAL;
461      }
462  
463      if ((s->header.features & QED_F_BACKING_FILE)) {
464          g_autofree char *backing_file_str = NULL;
465  
466          if ((uint64_t)s->header.backing_filename_offset +
467              s->header.backing_filename_size >
468              s->header.cluster_size * s->header.header_size) {
469              error_setg(errp, "QED backing filename offset is invalid");
470              return -EINVAL;
471          }
472  
473          backing_file_str = g_malloc(sizeof(bs->backing_file));
474          ret = qed_read_string(bs->file, s->header.backing_filename_offset,
475                                s->header.backing_filename_size,
476                                backing_file_str, sizeof(bs->backing_file));
477          if (ret < 0) {
478              error_setg(errp, "Failed to read backing filename");
479              return ret;
480          }
481  
482          if (!g_str_equal(backing_file_str, bs->backing_file)) {
483              pstrcpy(bs->backing_file, sizeof(bs->backing_file),
484                      backing_file_str);
485              pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
486                      backing_file_str);
487          }
488  
489          if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
490              pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
491          }
492      }
493  
494      /* Reset unknown autoclear feature bits.  This is a backwards
495       * compatibility mechanism that allows images to be opened by older
496       * programs, which "knock out" unknown feature bits.  When an image is
497       * opened by a newer program again it can detect that the autoclear
498       * feature is no longer valid.
499       */
500      if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
501          !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
502          s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
503  
504          ret = qed_write_header_sync(s);
505          if (ret) {
506              error_setg(errp, "Failed to update header");
507              return ret;
508          }
509  
510          /* From here on only known autoclear feature bits are valid */
511          bdrv_co_flush(bs->file->bs);
512      }
513  
514      s->l1_table = qed_alloc_table(s);
515      qed_init_l2_cache(&s->l2_cache);
516  
517      ret = qed_read_l1_table_sync(s);
518      if (ret) {
519          error_setg(errp, "Failed to read L1 table");
520          goto out;
521      }
522  
523      /* If image was not closed cleanly, check consistency */
524      if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
525          /* Read-only images cannot be fixed.  There is no risk of corruption
526           * since write operations are not possible.  Therefore, allow
527           * potentially inconsistent images to be opened read-only.  This can
528           * aid data recovery from an otherwise inconsistent image.
529           */
530          if (!bdrv_is_read_only(bs->file->bs) &&
531              !(flags & BDRV_O_INACTIVE)) {
532              BdrvCheckResult result = {0};
533  
534              ret = qed_check(s, &result, true);
535              if (ret) {
536                  error_setg(errp, "Image corrupted");
537                  goto out;
538              }
539          }
540      }
541  
542      bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
543  
544  out:
545      if (ret) {
546          qed_free_l2_cache(&s->l2_cache);
547          qemu_vfree(s->l1_table);
548      }
549      return ret;
550  }
551  
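/*
 * bdrv_qed_do_open() is a coroutine_fn, so bdrv_qed_open() below either calls
 * the entry point directly (when already running in coroutine context) or
 * spawns a coroutine and polls until qoc.ret changes from -EINPROGRESS.
 */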
552  typedef struct QEDOpenCo {
553      BlockDriverState *bs;
554      QDict *options;
555      int flags;
556      Error **errp;
557      int ret;
558  } QEDOpenCo;
559  
560  static void coroutine_fn GRAPH_RDLOCK bdrv_qed_open_entry(void *opaque)
561  {
562      QEDOpenCo *qoc = opaque;
563      BDRVQEDState *s = qoc->bs->opaque;
564  
565      qemu_co_mutex_lock(&s->table_lock);
566      qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
567      qemu_co_mutex_unlock(&s->table_lock);
568  }
569  
570  static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
571                           Error **errp)
572  {
573      QEDOpenCo qoc = {
574          .bs = bs,
575          .options = options,
576          .flags = flags,
577          .errp = errp,
578          .ret = -EINPROGRESS
579      };
580      int ret;
581  
582      assume_graph_lock(); /* FIXME */
583  
584      ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
585      if (ret < 0) {
586          return ret;
587      }
588  
589      bdrv_qed_init_state(bs);
590      if (qemu_in_coroutine()) {
591          bdrv_qed_open_entry(&qoc);
592      } else {
593          assert(qemu_get_current_aio_context() == qemu_get_aio_context());
594          qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
595          BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
596      }
597      return qoc.ret;
598  }
599  
600  static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
601  {
602      BDRVQEDState *s = bs->opaque;
603  
604      bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
605      bs->bl.max_pwrite_zeroes = QEMU_ALIGN_DOWN(INT_MAX, s->header.cluster_size);
606  }
607  
608  /* We have nothing to do for QED reopen; the stub just returns
609   * success */
610  static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
611                                     BlockReopenQueue *queue, Error **errp)
612  {
613      return 0;
614  }
615  
616  static void bdrv_qed_close(BlockDriverState *bs)
617  {
618      BDRVQEDState *s = bs->opaque;
619  
620      bdrv_qed_detach_aio_context(bs);
621  
622      /* Ensure writes reach stable storage */
623      bdrv_flush(bs->file->bs);
624  
625      /* Clean shutdown, no check required on next open */
626      if (s->header.features & QED_F_NEED_CHECK) {
627          s->header.features &= ~QED_F_NEED_CHECK;
628          qed_write_header_sync(s);
629      }
630  
631      qed_free_l2_cache(&s->l2_cache);
632      qemu_vfree(s->l1_table);
633  }
634  
635  static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
636                                             Error **errp)
637  {
638      BlockdevCreateOptionsQed *qed_opts;
639      BlockBackend *blk = NULL;
640      BlockDriverState *bs = NULL;
641  
642      QEDHeader header;
643      QEDHeader le_header;
644      uint8_t *l1_table = NULL;
645      size_t l1_size;
646      int ret = 0;
647  
648      assert(opts->driver == BLOCKDEV_DRIVER_QED);
649      qed_opts = &opts->u.qed;
650  
651      /* Validate options and set default values */
652      if (!qed_opts->has_cluster_size) {
653          qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
654      }
655      if (!qed_opts->has_table_size) {
656          qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
657      }
658  
659      if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
660          error_setg(errp, "QED cluster size must be within range [%u, %u] "
661                           "and power of 2",
662                     QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
663          return -EINVAL;
664      }
665      if (!qed_is_table_size_valid(qed_opts->table_size)) {
666          error_setg(errp, "QED table size must be within range [%u, %u] "
667                           "and power of 2",
668                     QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
669          return -EINVAL;
670      }
671      if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
672                                   qed_opts->table_size))
673      {
674          error_setg(errp, "QED image size must be a non-zero multiple of "
675                           "cluster size and less than %" PRIu64 " bytes",
676                     qed_max_image_size(qed_opts->cluster_size,
677                                        qed_opts->table_size));
678          return -EINVAL;
679      }
680  
681      /* Create BlockBackend to write to the image */
682      bs = bdrv_co_open_blockdev_ref(qed_opts->file, errp);
683      if (bs == NULL) {
684          return -EIO;
685      }
686  
687      blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
688                               errp);
689      if (!blk) {
690          ret = -EPERM;
691          goto out;
692      }
693      blk_set_allow_write_beyond_eof(blk, true);
694  
695      /* Prepare image format */
696      header = (QEDHeader) {
697          .magic = QED_MAGIC,
698          .cluster_size = qed_opts->cluster_size,
699          .table_size = qed_opts->table_size,
700          .header_size = 1,
701          .features = 0,
702          .compat_features = 0,
703          .l1_table_offset = qed_opts->cluster_size,
704          .image_size = qed_opts->size,
705      };
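    /*
     * Resulting on-disk layout with header_size == 1: the header (and the
     * optional backing filename directly after it) occupy cluster 0, and the
     * zeroed L1 table is written at cluster 1 (l1_table_offset ==
     * cluster_size).
     */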
706  
707      l1_size = header.cluster_size * header.table_size;
708  
709      /*
710       * The QED format associates file length with allocation status,
711       * so a new file (which is empty) must have a length of 0.
712       */
713      ret = blk_co_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp);
714      if (ret < 0) {
715          goto out;
716      }
717  
718      if (qed_opts->backing_file) {
719          header.features |= QED_F_BACKING_FILE;
720          header.backing_filename_offset = sizeof(le_header);
721          header.backing_filename_size = strlen(qed_opts->backing_file);
722  
723          if (qed_opts->has_backing_fmt) {
724              const char *backing_fmt = BlockdevDriver_str(qed_opts->backing_fmt);
725              if (qed_fmt_is_raw(backing_fmt)) {
726                  header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
727              }
728          }
729      }
730  
731      qed_header_cpu_to_le(&header, &le_header);
732      ret = blk_co_pwrite(blk, 0, sizeof(le_header), &le_header, 0);
733      if (ret < 0) {
734          goto out;
735      }
736      ret = blk_co_pwrite(blk, sizeof(le_header), header.backing_filename_size,
737                          qed_opts->backing_file, 0);
738      if (ret < 0) {
739          goto out;
740      }
741  
742      l1_table = g_malloc0(l1_size);
743      ret = blk_co_pwrite(blk, header.l1_table_offset, l1_size, l1_table, 0);
744      if (ret < 0) {
745          goto out;
746      }
747  
748      ret = 0; /* success */
749  out:
750      g_free(l1_table);
751      blk_unref(blk);
752      bdrv_unref(bs);
753      return ret;
754  }
755  
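/*
 * Convert legacy qemu-img create options into BlockdevCreateOptions and call
 * bdrv_qed_co_create().  This is reached, for example (hypothetical values),
 * via:
 *
 *   qemu-img create -f qed -o cluster_size=65536,table_size=4 disk.qed 16G
 */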
756  static int coroutine_fn GRAPH_RDLOCK
757  bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,
758                          QemuOpts *opts, Error **errp)
759  {
760      BlockdevCreateOptions *create_options = NULL;
761      QDict *qdict;
762      Visitor *v;
763      BlockDriverState *bs = NULL;
764      int ret;
765  
766      static const QDictRenames opt_renames[] = {
767          { BLOCK_OPT_BACKING_FILE,       "backing-file" },
768          { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
769          { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
770          { BLOCK_OPT_TABLE_SIZE,         "table-size" },
771          { NULL, NULL },
772      };
773  
774      /* Parse options and convert legacy syntax */
775      qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);
776  
777      if (!qdict_rename_keys(qdict, opt_renames, errp)) {
778          ret = -EINVAL;
779          goto fail;
780      }
781  
782      /* Create and open the file (protocol layer) */
783      ret = bdrv_co_create_file(filename, opts, errp);
784      if (ret < 0) {
785          goto fail;
786      }
787  
788      bs = bdrv_co_open(filename, NULL, NULL,
789                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
790      if (bs == NULL) {
791          ret = -EIO;
792          goto fail;
793      }
794  
795      /* Now get the QAPI type BlockdevCreateOptions */
796      qdict_put_str(qdict, "driver", "qed");
797      qdict_put_str(qdict, "file", bs->node_name);
798  
799      v = qobject_input_visitor_new_flat_confused(qdict, errp);
800      if (!v) {
801          ret = -EINVAL;
802          goto fail;
803      }
804  
805      visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
806      visit_free(v);
807      if (!create_options) {
808          ret = -EINVAL;
809          goto fail;
810      }
811  
812      /* Silently round up size */
813      assert(create_options->driver == BLOCKDEV_DRIVER_QED);
814      create_options->u.qed.size =
815          ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);
816  
817      /* Create the qed image (format layer) */
818      ret = bdrv_qed_co_create(create_options, errp);
819  
820  fail:
821      qobject_unref(qdict);
822      bdrv_unref(bs);
823      qapi_free_BlockdevCreateOptions(create_options);
824      return ret;
825  }
826  
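/*
 * Translate a QED cluster lookup result into bdrv_co_block_status() flags:
 * QED_CLUSTER_FOUND maps to BDRV_BLOCK_DATA with a valid host offset,
 * QED_CLUSTER_ZERO maps to BDRV_BLOCK_ZERO, and unallocated clusters
 * (QED_CLUSTER_L2/L1) report neither, so the backing file, if any, is used.
 */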
827  static int coroutine_fn GRAPH_RDLOCK
828  bdrv_qed_co_block_status(BlockDriverState *bs, bool want_zero, int64_t pos,
829                           int64_t bytes, int64_t *pnum, int64_t *map,
830                           BlockDriverState **file)
831  {
832      BDRVQEDState *s = bs->opaque;
833      size_t len = MIN(bytes, SIZE_MAX);
834      int status;
835      QEDRequest request = { .l2_table = NULL };
836      uint64_t offset;
837      int ret;
838  
839      qemu_co_mutex_lock(&s->table_lock);
840      ret = qed_find_cluster(s, &request, pos, &len, &offset);
841  
842      *pnum = len;
843      switch (ret) {
844      case QED_CLUSTER_FOUND:
845          *map = offset | qed_offset_into_cluster(s, pos);
846          status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
847          *file = bs->file->bs;
848          break;
849      case QED_CLUSTER_ZERO:
850          status = BDRV_BLOCK_ZERO;
851          break;
852      case QED_CLUSTER_L2:
853      case QED_CLUSTER_L1:
854          status = 0;
855          break;
856      default:
857          assert(ret < 0);
858          status = ret;
859          break;
860      }
861  
862      qed_unref_l2_cache_entry(request.l2_table);
863      qemu_co_mutex_unlock(&s->table_lock);
864  
865      return status;
866  }
867  
868  static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
869  {
870      return acb->bs->opaque;
871  }
872  
873  /**
874   * Read from the backing file or zero-fill if no backing file
875   *
876   * @s:              QED state
877   * @pos:            Byte position in device
878   * @qiov:           Destination I/O vector
879   *
880   * This function reads qiov->size bytes starting at pos from the backing file.
881   * If there is no backing file then zeroes are read.
882   */
883  static int coroutine_fn GRAPH_RDLOCK
884  qed_read_backing_file(BDRVQEDState *s, uint64_t pos, QEMUIOVector *qiov)
885  {
886      if (s->bs->backing) {
887          BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
888          return bdrv_co_preadv(s->bs->backing, pos, qiov->size, qiov, 0);
889      }
890      qemu_iovec_memset(qiov, 0, 0, qiov->size);
891      return 0;
892  }
893  
894  /**
895   * Copy data from backing file into the image
896   *
897   * @s:          QED state
898   * @pos:        Byte position in device
899   * @len:        Number of bytes
900   * @offset:     Byte offset in image file
901   */
902  static int coroutine_fn GRAPH_RDLOCK
903  qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, uint64_t len,
904                             uint64_t offset)
905  {
906      QEMUIOVector qiov;
907      int ret;
908  
909      /* Skip copy entirely if there is no work to do */
910      if (len == 0) {
911          return 0;
912      }
913  
914      qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
915  
916      ret = qed_read_backing_file(s, pos, &qiov);
917  
918      if (ret) {
919          goto out;
920      }
921  
922      BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
923      ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
924      if (ret < 0) {
925          goto out;
926      }
927      ret = 0;
928  out:
929      qemu_vfree(qemu_iovec_buf(&qiov));
930      return ret;
931  }
932  
933  /**
934   * Link one or more contiguous clusters into a table
935   *
936   * @s:              QED state
937   * @table:          L2 table
938   * @index:          First cluster index
939   * @n:              Number of contiguous clusters
940   * @cluster:        First cluster offset
941   *
942   * The cluster offset may be an allocated byte offset in the image file, the
943   * zero cluster marker, or the unallocated cluster marker.
944   *
945   * Called with table_lock held.
946   */
947  static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
948                                               int index, unsigned int n,
949                                               uint64_t cluster)
950  {
951      int i;
952      for (i = index; i < index + n; i++) {
953          table->offsets[i] = cluster;
954          if (!qed_offset_is_unalloc_cluster(cluster) &&
955              !qed_offset_is_zero_cluster(cluster)) {
956              cluster += s->header.cluster_size;
957          }
958      }
959  }
960  
961  /* Called with table_lock held.  */
962  static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
963  {
964      BDRVQEDState *s = acb_to_s(acb);
965  
966      /* Free resources */
967      qemu_iovec_destroy(&acb->cur_qiov);
968      qed_unref_l2_cache_entry(acb->request.l2_table);
969  
970      /* Free the buffer we may have allocated for zero writes */
971      if (acb->flags & QED_AIOCB_ZERO) {
972          qemu_vfree(acb->qiov->iov[0].iov_base);
973          acb->qiov->iov[0].iov_base = NULL;
974      }
975  
976      /* Start next allocating write request waiting behind this one.  Note that
977       * requests enqueue themselves when they first hit an unallocated cluster
978       * but they wait until the entire request is finished before waking up the
979       * next request in the queue.  This ensures that we don't cycle through
980       * requests multiple times but rather finish one at a time completely.
981       */
982      if (acb == s->allocating_acb) {
983          s->allocating_acb = NULL;
984          if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
985              qemu_co_queue_next(&s->allocating_write_reqs);
986          } else if (s->header.features & QED_F_NEED_CHECK) {
987              qed_start_need_check_timer(s);
988          }
989      }
990  }
991  
992  /**
993   * Update L1 table with new L2 table offset and write it out
994   *
995   * Called with table_lock held.
996   */
997  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_l1_update(QEDAIOCB *acb)
998  {
999      BDRVQEDState *s = acb_to_s(acb);
1000      CachedL2Table *l2_table = acb->request.l2_table;
1001      uint64_t l2_offset = l2_table->offset;
1002      int index, ret;
1003  
1004      index = qed_l1_index(s, acb->cur_pos);
1005      s->l1_table->offsets[index] = l2_table->offset;
1006  
1007      ret = qed_write_l1_table(s, index, 1);
1008  
1009      /* Commit the current L2 table to the cache */
1010      qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
1011  
1012      /* This is guaranteed to succeed because we just committed the entry to the
1013       * cache.
1014       */
1015      acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
1016      assert(acb->request.l2_table != NULL);
1017  
1018      return ret;
1019  }
1020  
1021  
1022  /**
1023   * Update L2 table with new cluster offsets and write them out
1024   *
1025   * Called with table_lock held.
1026   */
1027  static int coroutine_fn GRAPH_RDLOCK
1028  qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
1029  {
1030      BDRVQEDState *s = acb_to_s(acb);
1031      bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1032      int index, ret;
1033  
1034      if (need_alloc) {
1035          qed_unref_l2_cache_entry(acb->request.l2_table);
1036          acb->request.l2_table = qed_new_l2_table(s);
1037      }
1038  
1039      index = qed_l2_index(s, acb->cur_pos);
1040      qed_update_l2_table(s, acb->request.l2_table->table, index,
1041                          acb->cur_nclusters, offset);
1042  
1043      if (need_alloc) {
1044          /* Write out the whole new L2 table */
1045          ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
1046          if (ret) {
1047              return ret;
1048          }
1049          return qed_aio_write_l1_update(acb);
1050      } else {
1051          /* Write out only the updated part of the L2 table */
1052          ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
1053                                   false);
1054          if (ret) {
1055              return ret;
1056          }
1057      }
1058      return 0;
1059  }
1060  
1061  /**
1062   * Write data to the image file
1063   *
1064   * Called with table_lock *not* held.
1065   */
1066  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_main(QEDAIOCB *acb)
1067  {
1068      BDRVQEDState *s = acb_to_s(acb);
1069      uint64_t offset = acb->cur_cluster +
1070                        qed_offset_into_cluster(s, acb->cur_pos);
1071  
1072      trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
1073  
1074      BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1075      return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
1076                             &acb->cur_qiov, 0);
1077  }
1078  
1079  /**
1080   * Populate untouched regions of new data cluster
1081   *
1082   * Called with table_lock held.
1083   */
1084  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_cow(QEDAIOCB *acb)
1085  {
1086      BDRVQEDState *s = acb_to_s(acb);
1087      uint64_t start, len, offset;
1088      int ret;
1089  
1090      qemu_co_mutex_unlock(&s->table_lock);
1091  
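    /*
     * Regions of the cluster(s) being allocated, relative to the guest write:
     *
     *   |<- prefill ->|<----- guest data ----->|<- postfill ->|
     *   cluster start  acb->cur_pos        end of request      cluster end
     *
     * The prefill and postfill regions are filled from the backing file (or
     * with zeroes) so the newly allocated clusters contain valid data
     * throughout.
     */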
1092      /* Populate front untouched region of new data cluster */
1093      start = qed_start_of_cluster(s, acb->cur_pos);
1094      len = qed_offset_into_cluster(s, acb->cur_pos);
1095  
1096      trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1097      ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
1098      if (ret < 0) {
1099          goto out;
1100      }
1101  
1102      /* Populate back untouched region of new data cluster */
1103      start = acb->cur_pos + acb->cur_qiov.size;
1104      len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
1105      offset = acb->cur_cluster +
1106               qed_offset_into_cluster(s, acb->cur_pos) +
1107               acb->cur_qiov.size;
1108  
1109      trace_qed_aio_write_postfill(s, acb, start, len, offset);
1110      ret = qed_copy_from_backing_file(s, start, len, offset);
1111      if (ret < 0) {
1112          goto out;
1113      }
1114  
1115      ret = qed_aio_write_main(acb);
1116      if (ret < 0) {
1117          goto out;
1118      }
1119  
1120      if (s->bs->backing) {
1121          /*
1122           * Flush new data clusters before updating the L2 table
1123           *
1124           * This flush is necessary when a backing file is in use.  A crash
1125           * during an allocating write could result in empty clusters in the
1126           * image.  If the write only touched a subregion of the cluster,
1127           * then backing image sectors have been lost in the untouched
1128           * region.  The solution is to flush after writing a new data
1129           * cluster and before updating the L2 table.
1130           */
1131          ret = bdrv_co_flush(s->bs->file->bs);
1132      }
1133  
1134  out:
1135      qemu_co_mutex_lock(&s->table_lock);
1136      return ret;
1137  }
1138  
1139  /**
1140   * Check if the QED_F_NEED_CHECK bit should be set during an allocating write
1141   */
1142  static bool qed_should_set_need_check(BDRVQEDState *s)
1143  {
1144      /* The flush before L2 update path ensures consistency */
1145      if (s->bs->backing) {
1146          return false;
1147      }
1148  
1149      return !(s->header.features & QED_F_NEED_CHECK);
1150  }
1151  
1152  /**
1153   * Write new data cluster
1154   *
1155   * @acb:        Write request
1156   * @len:        Length in bytes
1157   *
1158   * This path is taken when writing to previously unallocated clusters.
1159   *
1160   * Called with table_lock held.
1161   */
1162  static int coroutine_fn GRAPH_RDLOCK
1163  qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1164  {
1165      BDRVQEDState *s = acb_to_s(acb);
1166      int ret;
1167  
1168      /* Cancel timer when the first allocating request comes in */
1169      if (s->allocating_acb == NULL) {
1170          qed_cancel_need_check_timer(s);
1171      }
1172  
1173      /* Freeze this request if another allocating write is in progress */
1174      if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
1175          if (s->allocating_acb != NULL) {
1176              qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
1177              assert(s->allocating_acb == NULL);
1178          }
1179          s->allocating_acb = acb;
1180          return -EAGAIN; /* start over with looking up table entries */
1181      }
1182  
1183      acb->cur_nclusters = qed_bytes_to_clusters(s,
1184              qed_offset_into_cluster(s, acb->cur_pos) + len);
1185      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1186  
1187      if (acb->flags & QED_AIOCB_ZERO) {
1188          /* Skip ahead if the clusters are already zero */
1189          if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1190              return 0;
1191          }
1192          acb->cur_cluster = 1;
1193      } else {
1194          acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
1195      }
1196  
1197      if (qed_should_set_need_check(s)) {
1198          s->header.features |= QED_F_NEED_CHECK;
1199          ret = qed_write_header(s);
1200          if (ret < 0) {
1201              return ret;
1202          }
1203      }
1204  
1205      if (!(acb->flags & QED_AIOCB_ZERO)) {
1206          ret = qed_aio_write_cow(acb);
1207          if (ret < 0) {
1208              return ret;
1209          }
1210      }
1211  
1212      return qed_aio_write_l2_update(acb, acb->cur_cluster);
1213  }
1214  
1215  /**
1216   * Write data cluster in place
1217   *
1218   * @acb:        Write request
1219   * @offset:     Cluster offset in bytes
1220   * @len:        Length in bytes
1221   *
1222   * This path is taken when writing to already allocated clusters.
1223   *
1224   * Called with table_lock held.
1225   */
1226  static int coroutine_fn GRAPH_RDLOCK
1227  qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
1228  {
1229      BDRVQEDState *s = acb_to_s(acb);
1230      int r;
1231  
1232      qemu_co_mutex_unlock(&s->table_lock);
1233  
1234      /* Allocate buffer for zero writes */
1235      if (acb->flags & QED_AIOCB_ZERO) {
1236          struct iovec *iov = acb->qiov->iov;
1237  
1238          if (!iov->iov_base) {
1239              iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
1240              if (iov->iov_base == NULL) {
1241                  r = -ENOMEM;
1242                  goto out;
1243              }
1244              memset(iov->iov_base, 0, iov->iov_len);
1245          }
1246      }
1247  
1248      /* Calculate the I/O vector */
1249      acb->cur_cluster = offset;
1250      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1251  
1252      /* Do the actual write.  */
1253      r = qed_aio_write_main(acb);
1254  out:
1255      qemu_co_mutex_lock(&s->table_lock);
1256      return r;
1257  }
1258  
1259  /**
1260   * Write data cluster
1261   *
1262   * @opaque:     Write request
1263   * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1264   * @offset:     Cluster offset in bytes
1265   * @len:        Length in bytes
1266   *
1267   * Called with table_lock held.
1268   */
1269  static int coroutine_fn GRAPH_RDLOCK
1270  qed_aio_write_data(void *opaque, int ret, uint64_t offset, size_t len)
1271  {
1272      QEDAIOCB *acb = opaque;
1273  
1274      trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1275  
1276      acb->find_cluster_ret = ret;
1277  
1278      switch (ret) {
1279      case QED_CLUSTER_FOUND:
1280          return qed_aio_write_inplace(acb, offset, len);
1281  
1282      case QED_CLUSTER_L2:
1283      case QED_CLUSTER_L1:
1284      case QED_CLUSTER_ZERO:
1285          return qed_aio_write_alloc(acb, len);
1286  
1287      default:
1288          g_assert_not_reached();
1289      }
1290  }
1291  
1292  /**
1293   * Read data cluster
1294   *
1295   * @opaque:     Read request
1296   * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1297   * @offset:     Cluster offset in bytes
1298   * @len:        Length in bytes
1299   *
1300   * Called with table_lock held.
1301   */
1302  static int coroutine_fn GRAPH_RDLOCK
1303  qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
1304  {
1305      QEDAIOCB *acb = opaque;
1306      BDRVQEDState *s = acb_to_s(acb);
1307      BlockDriverState *bs = acb->bs;
1308      int r;
1309  
1310      qemu_co_mutex_unlock(&s->table_lock);
1311  
1312      /* Adjust offset into cluster */
1313      offset += qed_offset_into_cluster(s, acb->cur_pos);
1314  
1315      trace_qed_aio_read_data(s, acb, ret, offset, len);
1316  
1317      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1318  
1319      /* Handle zero cluster and backing file reads; otherwise read the
1320       * data cluster directly.
1321       */
1322      if (ret == QED_CLUSTER_ZERO) {
1323          qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
1324          r = 0;
1325      } else if (ret != QED_CLUSTER_FOUND) {
1326          r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov);
1327      } else {
1328          BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
1329          r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
1330                             &acb->cur_qiov, 0);
1331      }
1332  
1333      qemu_co_mutex_lock(&s->table_lock);
1334      return r;
1335  }
1336  
1337  /**
1338   * Begin next I/O or complete the request
1339   */
1340  static int coroutine_fn GRAPH_RDLOCK qed_aio_next_io(QEDAIOCB *acb)
1341  {
1342      BDRVQEDState *s = acb_to_s(acb);
1343      uint64_t offset;
1344      size_t len;
1345      int ret;
1346  
1347      qemu_co_mutex_lock(&s->table_lock);
1348      while (1) {
1349          trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
1350  
1351          acb->qiov_offset += acb->cur_qiov.size;
1352          acb->cur_pos += acb->cur_qiov.size;
1353          qemu_iovec_reset(&acb->cur_qiov);
1354  
1355          /* Complete request */
1356          if (acb->cur_pos >= acb->end_pos) {
1357              ret = 0;
1358              break;
1359          }
1360  
1361          /* Find next cluster and start I/O */
1362          len = acb->end_pos - acb->cur_pos;
1363          ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
1364          if (ret < 0) {
1365              break;
1366          }
1367  
1368          if (acb->flags & QED_AIOCB_WRITE) {
1369              ret = qed_aio_write_data(acb, ret, offset, len);
1370          } else {
1371              ret = qed_aio_read_data(acb, ret, offset, len);
1372          }
1373  
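        /* -EAGAIN means the request queued behind another allocating write in
         * qed_aio_write_alloc(); loop again so the cluster lookup is retried
         * once this request is woken up.
         */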
1374          if (ret < 0 && ret != -EAGAIN) {
1375              break;
1376          }
1377      }
1378  
1379      trace_qed_aio_complete(s, acb, ret);
1380      qed_aio_complete(acb);
1381      qemu_co_mutex_unlock(&s->table_lock);
1382      return ret;
1383  }
1384  
1385  static int coroutine_fn GRAPH_RDLOCK
1386  qed_co_request(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov,
1387                 int nb_sectors, int flags)
1388  {
1389      QEDAIOCB acb = {
1390          .bs         = bs,
1391          .cur_pos    = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
1392          .end_pos    = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
1393          .qiov       = qiov,
1394          .flags      = flags,
1395      };
1396      qemu_iovec_init(&acb.cur_qiov, qiov->niov);
1397  
1398      trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
1399  
1400      /* Start request */
1401      return qed_aio_next_io(&acb);
1402  }
1403  
1404  static int coroutine_fn GRAPH_RDLOCK
1405  bdrv_qed_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1406                    QEMUIOVector *qiov)
1407  {
1408      return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
1409  }
1410  
1411  static int coroutine_fn GRAPH_RDLOCK
1412  bdrv_qed_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1413                     QEMUIOVector *qiov, int flags)
1414  {
1415      return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
1416  }
1417  
1418  static int coroutine_fn GRAPH_RDLOCK
1419  bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
1420                            BdrvRequestFlags flags)
1421  {
1422      BDRVQEDState *s = bs->opaque;
1423  
1424      /*
1425       * Zero writes start without an I/O buffer.  If a buffer becomes necessary
1426       * then it will be allocated during request processing.
1427       */
1428      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
1429  
1430      /*
1431       * QED is not prepared for 63bit write-zero requests, so rely on
1432       * max_pwrite_zeroes.
1433       */
1434      assert(bytes <= INT_MAX);
1435  
1436      /* Fall back if the request is not aligned */
1437      if (qed_offset_into_cluster(s, offset) ||
1438          qed_offset_into_cluster(s, bytes)) {
1439          return -ENOTSUP;
1440      }
1441  
1442      return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
1443                            bytes >> BDRV_SECTOR_BITS,
1444                            QED_AIOCB_WRITE | QED_AIOCB_ZERO);
1445  }
1446  
1447  static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
1448                                               int64_t offset,
1449                                               bool exact,
1450                                               PreallocMode prealloc,
1451                                               BdrvRequestFlags flags,
1452                                               Error **errp)
1453  {
1454      BDRVQEDState *s = bs->opaque;
1455      uint64_t old_image_size;
1456      int ret;
1457  
1458      if (prealloc != PREALLOC_MODE_OFF) {
1459          error_setg(errp, "Unsupported preallocation mode '%s'",
1460                     PreallocMode_str(prealloc));
1461          return -ENOTSUP;
1462      }
1463  
1464      if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1465                                   s->header.table_size)) {
1466          error_setg(errp, "Invalid image size specified");
1467          return -EINVAL;
1468      }
1469  
1470      if ((uint64_t)offset < s->header.image_size) {
1471          error_setg(errp, "Shrinking images is currently not supported");
1472          return -ENOTSUP;
1473      }
1474  
1475      old_image_size = s->header.image_size;
1476      s->header.image_size = offset;
1477      ret = qed_write_header_sync(s);
1478      if (ret < 0) {
1479          s->header.image_size = old_image_size;
1480          error_setg_errno(errp, -ret, "Failed to update the image size");
1481      }
1482      return ret;
1483  }
1484  
1485  static int64_t coroutine_fn bdrv_qed_co_getlength(BlockDriverState *bs)
1486  {
1487      BDRVQEDState *s = bs->opaque;
1488      return s->header.image_size;
1489  }
1490  
1491  static int coroutine_fn
1492  bdrv_qed_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1493  {
1494      BDRVQEDState *s = bs->opaque;
1495  
1496      memset(bdi, 0, sizeof(*bdi));
1497      bdi->cluster_size = s->header.cluster_size;
1498      bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1499      return 0;
1500  }
1501  
1502  static int bdrv_qed_change_backing_file(BlockDriverState *bs,
1503                                          const char *backing_file,
1504                                          const char *backing_fmt)
1505  {
1506      BDRVQEDState *s = bs->opaque;
1507      QEDHeader new_header, le_header;
1508      void *buffer;
1509      size_t buffer_len, backing_file_len;
1510      int ret;
1511  
1512      /* Refuse to set backing filename if unknown compat feature bits are
1513       * active.  If the image uses an unknown compat feature then we may not
1514       * know the layout of data following the header structure and cannot safely
1515       * add a new string.
1516       */
1517      if (backing_file && (s->header.compat_features &
1518                           ~QED_COMPAT_FEATURE_MASK)) {
1519          return -ENOTSUP;
1520      }
1521  
1522      memcpy(&new_header, &s->header, sizeof(new_header));
1523  
1524      new_header.features &= ~(QED_F_BACKING_FILE |
1525                               QED_F_BACKING_FORMAT_NO_PROBE);
1526  
1527      /* Adjust feature flags */
1528      if (backing_file) {
1529          new_header.features |= QED_F_BACKING_FILE;
1530  
1531          if (qed_fmt_is_raw(backing_fmt)) {
1532              new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
1533          }
1534      }
1535  
1536      /* Calculate new header size */
1537      backing_file_len = 0;
1538  
1539      if (backing_file) {
1540          backing_file_len = strlen(backing_file);
1541      }
1542  
1543      buffer_len = sizeof(new_header);
1544      new_header.backing_filename_offset = buffer_len;
1545      new_header.backing_filename_size = backing_file_len;
1546      buffer_len += backing_file_len;
1547  
1548      /* Make sure we can rewrite header without failing */
1549      if (buffer_len > new_header.header_size * new_header.cluster_size) {
1550          return -ENOSPC;
1551      }
1552  
1553      /* Prepare new header */
1554      buffer = g_malloc(buffer_len);
1555  
1556      qed_header_cpu_to_le(&new_header, &le_header);
1557      memcpy(buffer, &le_header, sizeof(le_header));
1558      buffer_len = sizeof(le_header);
1559  
1560      if (backing_file) {
1561          memcpy(buffer + buffer_len, backing_file, backing_file_len);
1562          buffer_len += backing_file_len;
1563      }
1564  
1565      /* Write new header */
1566      ret = bdrv_pwrite_sync(bs->file, 0, buffer_len, buffer, 0);
1567      g_free(buffer);
1568      if (ret == 0) {
1569          memcpy(&s->header, &new_header, sizeof(new_header));
1570      }
1571      return ret;
1572  }
1573  
1574  static void coroutine_fn GRAPH_RDLOCK
1575  bdrv_qed_co_invalidate_cache(BlockDriverState *bs, Error **errp)
1576  {
1577      BDRVQEDState *s = bs->opaque;
1578      int ret;
1579  
1580      bdrv_qed_close(bs);
1581  
1582      bdrv_qed_init_state(bs);
1583      qemu_co_mutex_lock(&s->table_lock);
1584      ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, errp);
1585      qemu_co_mutex_unlock(&s->table_lock);
1586      if (ret < 0) {
1587          error_prepend(errp, "Could not reopen qed layer: ");
1588      }
1589  }
1590  
1591  static int coroutine_fn GRAPH_RDLOCK
1592  bdrv_qed_co_check(BlockDriverState *bs, BdrvCheckResult *result,
1593                    BdrvCheckMode fix)
1594  {
1595      BDRVQEDState *s = bs->opaque;
1596      int ret;
1597  
1598      qemu_co_mutex_lock(&s->table_lock);
1599      ret = qed_check(s, result, !!fix);
1600      qemu_co_mutex_unlock(&s->table_lock);
1601  
1602      return ret;
1603  }
1604  
1605  static QemuOptsList qed_create_opts = {
1606      .name = "qed-create-opts",
1607      .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
1608      .desc = {
1609          {
1610              .name = BLOCK_OPT_SIZE,
1611              .type = QEMU_OPT_SIZE,
1612              .help = "Virtual disk size"
1613          },
1614          {
1615              .name = BLOCK_OPT_BACKING_FILE,
1616              .type = QEMU_OPT_STRING,
1617              .help = "File name of a base image"
1618          },
1619          {
1620              .name = BLOCK_OPT_BACKING_FMT,
1621              .type = QEMU_OPT_STRING,
1622              .help = "Image format of the base image"
1623          },
1624          {
1625              .name = BLOCK_OPT_CLUSTER_SIZE,
1626              .type = QEMU_OPT_SIZE,
1627              .help = "Cluster size (in bytes)",
1628              .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
1629          },
1630          {
1631              .name = BLOCK_OPT_TABLE_SIZE,
1632              .type = QEMU_OPT_SIZE,
1633              .help = "L1/L2 table size (in clusters)"
1634          },
1635          { /* end of list */ }
1636      }
1637  };
1638  
1639  static BlockDriver bdrv_qed = {
1640      .format_name              = "qed",
1641      .instance_size            = sizeof(BDRVQEDState),
1642      .create_opts              = &qed_create_opts,
1643      .is_format                = true,
1644      .supports_backing         = true,
1645  
1646      .bdrv_probe               = bdrv_qed_probe,
1647      .bdrv_open                = bdrv_qed_open,
1648      .bdrv_close               = bdrv_qed_close,
1649      .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
1650      .bdrv_child_perm          = bdrv_default_perms,
1651      .bdrv_co_create           = bdrv_qed_co_create,
1652      .bdrv_co_create_opts      = bdrv_qed_co_create_opts,
1653      .bdrv_has_zero_init       = bdrv_has_zero_init_1,
1654      .bdrv_co_block_status     = bdrv_qed_co_block_status,
1655      .bdrv_co_readv            = bdrv_qed_co_readv,
1656      .bdrv_co_writev           = bdrv_qed_co_writev,
1657      .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
1658      .bdrv_co_truncate         = bdrv_qed_co_truncate,
1659      .bdrv_co_getlength        = bdrv_qed_co_getlength,
1660      .bdrv_co_get_info         = bdrv_qed_co_get_info,
1661      .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
1662      .bdrv_change_backing_file = bdrv_qed_change_backing_file,
1663      .bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
1664      .bdrv_co_check            = bdrv_qed_co_check,
1665      .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
1666      .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
1667      .bdrv_drain_begin         = bdrv_qed_drain_begin,
1668  };
1669  
1670  static void bdrv_qed_init(void)
1671  {
1672      bdrv_register(&bdrv_qed);
1673  }
1674  
1675  block_init(bdrv_qed_init);
1676