1  /*
2   * QEMU Enhanced Disk Format
3   *
4   * Copyright IBM, Corp. 2010
5   *
6   * Authors:
7   *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
8   *  Anthony Liguori   <aliguori@us.ibm.com>
9   *
10   * This work is licensed under the terms of the GNU LGPL, version 2 or later.
11   * See the COPYING.LIB file in the top-level directory.
12   *
13   */
14  
15  #include "qemu/osdep.h"
16  #include "block/qdict.h"
17  #include "qapi/error.h"
18  #include "qemu/timer.h"
19  #include "qemu/bswap.h"
20  #include "qemu/main-loop.h"
21  #include "qemu/module.h"
22  #include "qemu/option.h"
23  #include "qemu/memalign.h"
24  #include "trace.h"
25  #include "qed.h"
26  #include "sysemu/block-backend.h"
27  #include "qapi/qmp/qdict.h"
28  #include "qapi/qobject-input-visitor.h"
29  #include "qapi/qapi-visit-block-core.h"
30  
31  static QemuOptsList qed_create_opts;
32  
33  static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
34                            const char *filename)
35  {
36      const QEDHeader *header = (const QEDHeader *)buf;
37  
38      if (buf_size < sizeof(*header)) {
39          return 0;
40      }
41      if (le32_to_cpu(header->magic) != QED_MAGIC) {
42          return 0;
43      }
44      return 100;
45  }
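/*
 * For illustration (not part of the probe logic itself): the first four bytes
 * of a QED image are the ASCII characters 'Q', 'E', 'D' followed by a NUL
 * byte, so le32_to_cpu() of the on-disk magic yields QED_MAGIC on any host
 * endianness.  A score of 100 tells the format probing code this is a
 * confident match, while 0 means "not a QED image".
 */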
46  
47  /**
48   * Check whether an image format is raw
49   *
50   * @fmt:    Backing file format, may be NULL
51   */
52  static bool qed_fmt_is_raw(const char *fmt)
53  {
54      return fmt && strcmp(fmt, "raw") == 0;
55  }
56  
57  static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
58  {
59      cpu->magic = le32_to_cpu(le->magic);
60      cpu->cluster_size = le32_to_cpu(le->cluster_size);
61      cpu->table_size = le32_to_cpu(le->table_size);
62      cpu->header_size = le32_to_cpu(le->header_size);
63      cpu->features = le64_to_cpu(le->features);
64      cpu->compat_features = le64_to_cpu(le->compat_features);
65      cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
66      cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
67      cpu->image_size = le64_to_cpu(le->image_size);
68      cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
69      cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
70  }
71  
72  static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
73  {
74      le->magic = cpu_to_le32(cpu->magic);
75      le->cluster_size = cpu_to_le32(cpu->cluster_size);
76      le->table_size = cpu_to_le32(cpu->table_size);
77      le->header_size = cpu_to_le32(cpu->header_size);
78      le->features = cpu_to_le64(cpu->features);
79      le->compat_features = cpu_to_le64(cpu->compat_features);
80      le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
81      le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
82      le->image_size = cpu_to_le64(cpu->image_size);
83      le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
84      le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
85  }
86  
87  int qed_write_header_sync(BDRVQEDState *s)
88  {
89      QEDHeader le;
90  
91      qed_header_cpu_to_le(&s->header, &le);
92      return bdrv_pwrite(s->bs->file, 0, sizeof(le), &le, 0);
93  }
94  
95  /**
96   * Update header in-place (does not rewrite backing filename or other strings)
97   *
98   * This function only updates known header fields in-place and does not affect
99   * extra data after the QED header.
100   *
101   * No new allocating reqs can start while this function runs.
102   */
103  static int coroutine_fn qed_write_header(BDRVQEDState *s)
104  {
105      /* We must write full sectors for O_DIRECT but cannot necessarily generate
106       * the data following the header if an unrecognized compat feature is
107       * active.  Therefore, first read the sectors containing the header, update
108       * them, and write back.
109       */
110  
111      int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
112      size_t len = nsectors * BDRV_SECTOR_SIZE;
113      uint8_t *buf;
114      int ret;
115  
116      assert(s->allocating_acb || s->allocating_write_reqs_plugged);
117  
118      buf = qemu_blockalign(s->bs, len);
119  
120      ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
121      if (ret < 0) {
122          goto out;
123      }
124  
125      /* Update header */
126      qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
127  
128      ret = bdrv_co_pwrite(s->bs->file, 0, len, buf, 0);
129      if (ret < 0) {
130          goto out;
131      }
132  
133      ret = 0;
134  out:
135      qemu_vfree(buf);
136      return ret;
137  }
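/*
 * Worked example, assuming the 64-byte QEDHeader from qed.h and a 512-byte
 * BDRV_SECTOR_SIZE: nsectors = DIV_ROUND_UP(64, 512) = 1 and len = 512, so
 * the read-modify-write above touches exactly one sector.  Only the first
 * sizeof(QEDHeader) bytes of that sector are regenerated; whatever an
 * unrecognized compat feature may have stored behind the header is read back
 * and rewritten unmodified.
 */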
138  
139  static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
140  {
141      uint64_t table_entries;
142      uint64_t l2_size;
143  
144      table_entries = (table_size * cluster_size) / sizeof(uint64_t);
145      l2_size = table_entries * cluster_size;
146  
147      return l2_size * table_entries;
148  }
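/*
 * Rough sanity check of the arithmetic above using the defaults from qed.h
 * (QED_DEFAULT_CLUSTER_SIZE = 64 KiB, QED_DEFAULT_TABLE_SIZE = 4 clusters):
 *
 *   table_entries = (4 * 65536) / 8 = 32768 entries per table
 *   l2_size       = 32768 * 65536   = 2 GiB mapped per L2 table
 *   max image     = 2 GiB * 32768   = 64 TiB
 *
 * so a default-formatted QED image can address up to 64 TiB.
 */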
149  
150  static bool qed_is_cluster_size_valid(uint32_t cluster_size)
151  {
152      if (cluster_size < QED_MIN_CLUSTER_SIZE ||
153          cluster_size > QED_MAX_CLUSTER_SIZE) {
154          return false;
155      }
156      if (cluster_size & (cluster_size - 1)) {
157          return false; /* not power of 2 */
158      }
159      return true;
160  }
161  
162  static bool qed_is_table_size_valid(uint32_t table_size)
163  {
164      if (table_size < QED_MIN_TABLE_SIZE ||
165          table_size > QED_MAX_TABLE_SIZE) {
166          return false;
167      }
168      if (table_size & (table_size - 1)) {
169          return false; /* not power of 2 */
170      }
171      return true;
172  }
173  
174  static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
175                                      uint32_t table_size)
176  {
177      if (image_size % BDRV_SECTOR_SIZE != 0) {
178          return false; /* not multiple of sector size */
179      }
180      if (image_size > qed_max_image_size(cluster_size, table_size)) {
181          return false; /* image is too large */
182      }
183      return true;
184  }
185  
186  /**
187   * Read a string of known length from the image file
188   *
189   * @file:       Image file
190   * @offset:     File offset to start of string, in bytes
191   * @n:          String length in bytes
192   * @buf:        Destination buffer
193   * @buflen:     Destination buffer length in bytes
194   * @ret:        0 on success, -errno on failure
195   *
196   * The string is NUL-terminated.
197   */
198  static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
199                             char *buf, size_t buflen)
200  {
201      int ret;
202      if (n >= buflen) {
203          return -EINVAL;
204      }
205      ret = bdrv_pread(file, offset, n, buf, 0);
206      if (ret < 0) {
207          return ret;
208      }
209      buf[n] = '\0';
210      return 0;
211  }
212  
213  /**
214   * Allocate new clusters
215   *
216   * @s:          QED state
217   * @n:          Number of contiguous clusters to allocate
218   * @ret:        Offset of first allocated cluster
219   *
220   * This function only produces the offset where the new clusters should be
221   * written.  It updates BDRVQEDState but does not make any changes to the image
222   * file.
223   *
224   * Called with table_lock held.
225   */
226  static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
227  {
228      uint64_t offset = s->file_size;
229      s->file_size += n * s->header.cluster_size;
230      return offset;
231  }
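/*
 * Example: with 64 KiB clusters and s->file_size currently at 1 MiB,
 * qed_alloc_clusters(s, 4) returns 1 MiB (the old end of file) and advances
 * s->file_size to 1 MiB + 256 KiB.  Nothing is written here; the caller must
 * populate the clusters and update the tables, which is why an interrupted
 * allocating write leaves QED_F_NEED_CHECK set and triggers a consistency
 * check on the next open.
 */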
232  
233  QEDTable *qed_alloc_table(BDRVQEDState *s)
234  {
235      /* Honor O_DIRECT memory alignment requirements */
236      return qemu_blockalign(s->bs,
237                             s->header.cluster_size * s->header.table_size);
238  }
239  
240  /**
241   * Allocate a new zeroed L2 table
242   *
243   * Called with table_lock held.
244   */
245  static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
246  {
247      CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
248  
249      l2_table->table = qed_alloc_table(s);
250      l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
251  
252      memset(l2_table->table->offsets, 0,
253             s->header.cluster_size * s->header.table_size);
254      return l2_table;
255  }
256  
257  static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s)
258  {
259      qemu_co_mutex_lock(&s->table_lock);
260  
261      /* No reentrancy is allowed.  */
262      assert(!s->allocating_write_reqs_plugged);
263      if (s->allocating_acb != NULL) {
264          /* Another allocating write came concurrently.  This cannot happen
265           * from bdrv_qed_drain_begin, but it can happen when the timer runs.
266           */
267          qemu_co_mutex_unlock(&s->table_lock);
268          return false;
269      }
270  
271      s->allocating_write_reqs_plugged = true;
272      qemu_co_mutex_unlock(&s->table_lock);
273      return true;
274  }
275  
276  static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s)
277  {
278      qemu_co_mutex_lock(&s->table_lock);
279      assert(s->allocating_write_reqs_plugged);
280      s->allocating_write_reqs_plugged = false;
281      qemu_co_queue_next(&s->allocating_write_reqs);
282      qemu_co_mutex_unlock(&s->table_lock);
283  }
284  
285  static void coroutine_fn qed_need_check_timer(BDRVQEDState *s)
286  {
287      int ret;
288  
289      trace_qed_need_check_timer_cb(s);
290  
291      if (!qed_plug_allocating_write_reqs(s)) {
292          return;
293      }
294  
295      /* Ensure writes are on disk before clearing flag */
296      ret = bdrv_co_flush(s->bs->file->bs);
297      if (ret < 0) {
298          qed_unplug_allocating_write_reqs(s);
299          return;
300      }
301  
302      s->header.features &= ~QED_F_NEED_CHECK;
303      ret = qed_write_header(s);
304      (void) ret;
305  
306      qed_unplug_allocating_write_reqs(s);
307  
308      ret = bdrv_co_flush(s->bs);
309      (void) ret;
310  }
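/*
 * Ordering note on the sequence above: allocating writes are plugged first,
 * the image file is flushed so every cluster allocated so far is stable, and
 * only then is QED_F_NEED_CHECK cleared and the header rewritten.  If any
 * step fails, the flag simply remains set and the next open falls back to a
 * consistency check, so ignoring the later return values is safe.
 */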
311  
312  static void coroutine_fn qed_need_check_timer_entry(void *opaque)
313  {
314      BDRVQEDState *s = opaque;
315  
316      qed_need_check_timer(opaque);
317      bdrv_dec_in_flight(s->bs);
318  }
319  
320  static void qed_need_check_timer_cb(void *opaque)
321  {
322      BDRVQEDState *s = opaque;
323      Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
324  
325      bdrv_inc_in_flight(s->bs);
326      qemu_coroutine_enter(co);
327  }
328  
329  static void qed_start_need_check_timer(BDRVQEDState *s)
330  {
331      trace_qed_start_need_check_timer(s);
332  
333      /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
334       * migration.
335       */
336      timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
337                     NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
338  }
339  
340  /* It's okay to call this multiple times or when no timer is started */
341  static void qed_cancel_need_check_timer(BDRVQEDState *s)
342  {
343      trace_qed_cancel_need_check_timer(s);
344      timer_del(s->need_check_timer);
345  }
346  
347  static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
348  {
349      BDRVQEDState *s = bs->opaque;
350  
351      qed_cancel_need_check_timer(s);
352      timer_free(s->need_check_timer);
353  }
354  
355  static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
356                                          AioContext *new_context)
357  {
358      BDRVQEDState *s = bs->opaque;
359  
360      s->need_check_timer = aio_timer_new(new_context,
361                                          QEMU_CLOCK_VIRTUAL, SCALE_NS,
362                                          qed_need_check_timer_cb, s);
363      if (s->header.features & QED_F_NEED_CHECK) {
364          qed_start_need_check_timer(s);
365      }
366  }
367  
368  static void bdrv_qed_drain_begin(BlockDriverState *bs)
369  {
370      BDRVQEDState *s = bs->opaque;
371  
372      /* Fire the timer immediately in order to start doing I/O as soon as the
373       * header is flushed.
374       */
375      if (s->need_check_timer && timer_pending(s->need_check_timer)) {
376          Coroutine *co;
377  
378          qed_cancel_need_check_timer(s);
379          co = qemu_coroutine_create(qed_need_check_timer_entry, s);
380          bdrv_inc_in_flight(bs);
381          aio_co_enter(bdrv_get_aio_context(bs), co);
382      }
383  }
384  
385  static void bdrv_qed_init_state(BlockDriverState *bs)
386  {
387      BDRVQEDState *s = bs->opaque;
388  
389      memset(s, 0, sizeof(BDRVQEDState));
390      s->bs = bs;
391      qemu_co_mutex_init(&s->table_lock);
392      qemu_co_queue_init(&s->allocating_write_reqs);
393  }
394  
395  /* Called with table_lock held.  */
396  static int coroutine_fn bdrv_qed_do_open(BlockDriverState *bs, QDict *options,
397                                           int flags, Error **errp)
398  {
399      BDRVQEDState *s = bs->opaque;
400      QEDHeader le_header;
401      int64_t file_size;
402      int ret;
403  
404      ret = bdrv_co_pread(bs->file, 0, sizeof(le_header), &le_header, 0);
405      if (ret < 0) {
406          error_setg(errp, "Failed to read QED header");
407          return ret;
408      }
409      qed_header_le_to_cpu(&le_header, &s->header);
410  
411      if (s->header.magic != QED_MAGIC) {
412          error_setg(errp, "Image not in QED format");
413          return -EINVAL;
414      }
415      if (s->header.features & ~QED_FEATURE_MASK) {
416          /* image uses unsupported feature bits */
417          error_setg(errp, "Unsupported QED features: %" PRIx64,
418                     s->header.features & ~QED_FEATURE_MASK);
419          return -ENOTSUP;
420      }
421      if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
422          error_setg(errp, "QED cluster size is invalid");
423          return -EINVAL;
424      }
425  
426      /* Round down file size to the last cluster */
427      file_size = bdrv_co_getlength(bs->file->bs);
428      if (file_size < 0) {
429          error_setg(errp, "Failed to get file length");
430          return file_size;
431      }
432      s->file_size = qed_start_of_cluster(s, file_size);
433  
434      if (!qed_is_table_size_valid(s->header.table_size)) {
435          error_setg(errp, "QED table size is invalid");
436          return -EINVAL;
437      }
438      if (!qed_is_image_size_valid(s->header.image_size,
439                                   s->header.cluster_size,
440                                   s->header.table_size)) {
441          error_setg(errp, "QED image size is invalid");
442          return -EINVAL;
443      }
444      if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
445          error_setg(errp, "QED table offset is invalid");
446          return -EINVAL;
447      }
448  
449      s->table_nelems = (s->header.cluster_size * s->header.table_size) /
450                        sizeof(uint64_t);
451      s->l2_shift = ctz32(s->header.cluster_size);
452      s->l2_mask = s->table_nelems - 1;
453      s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
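    /*
     * Illustration of the decomposition these shifts produce, assuming the
     * default 64 KiB clusters and table_size of 4 (table_nelems = 32768,
     * l2_shift = 16, l1_shift = 31): for a guest offset pos,
     *
     *   L1 index          = pos >> 31            (each L1 entry maps 2 GiB)
     *   L2 index          = (pos >> 16) & l2_mask
     *   offset in cluster = pos & 65535
     */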
454  
455      /* Header size calculation must not overflow uint32_t */
456      if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
457          error_setg(errp, "QED header size is too large");
458          return -EINVAL;
459      }
460  
461      if ((s->header.features & QED_F_BACKING_FILE)) {
462          g_autofree char *backing_file_str = NULL;
463  
464          if ((uint64_t)s->header.backing_filename_offset +
465              s->header.backing_filename_size >
466              s->header.cluster_size * s->header.header_size) {
467              error_setg(errp, "QED backing filename offset is invalid");
468              return -EINVAL;
469          }
470  
471          backing_file_str = g_malloc(sizeof(bs->backing_file));
472          ret = qed_read_string(bs->file, s->header.backing_filename_offset,
473                                s->header.backing_filename_size,
474                                backing_file_str, sizeof(bs->backing_file));
475          if (ret < 0) {
476              error_setg(errp, "Failed to read backing filename");
477              return ret;
478          }
479  
480          if (!g_str_equal(backing_file_str, bs->backing_file)) {
481              pstrcpy(bs->backing_file, sizeof(bs->backing_file),
482                      backing_file_str);
483              pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
484                      backing_file_str);
485          }
486  
487          if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
488              pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
489          }
490      }
491  
492      /* Reset unknown autoclear feature bits.  This is a backwards
493       * compatibility mechanism that allows images to be opened by older
494       * programs, which "knock out" unknown feature bits.  When an image is
495       * opened by a newer program again it can detect that the autoclear
496       * feature is no longer valid.
497       */
498      if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
499          !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
500          s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
501  
502          ret = qed_write_header_sync(s);
503          if (ret) {
504              error_setg(errp, "Failed to update header");
505              return ret;
506          }
507  
508          /* From here on only known autoclear feature bits are valid */
509          bdrv_co_flush(bs->file->bs);
510      }
511  
512      s->l1_table = qed_alloc_table(s);
513      qed_init_l2_cache(&s->l2_cache);
514  
515      ret = qed_read_l1_table_sync(s);
516      if (ret) {
517          error_setg(errp, "Failed to read L1 table");
518          goto out;
519      }
520  
521      /* If image was not closed cleanly, check consistency */
522      if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
523          /* Read-only images cannot be fixed.  There is no risk of corruption
524           * since write operations are not possible.  Therefore, allow
525           * potentially inconsistent images to be opened read-only.  This can
526           * aid data recovery from an otherwise inconsistent image.
527           */
528          if (!bdrv_is_read_only(bs->file->bs) &&
529              !(flags & BDRV_O_INACTIVE)) {
530              BdrvCheckResult result = {0};
531  
532              ret = qed_check(s, &result, true);
533              if (ret) {
534                  error_setg(errp, "Image corrupted");
535                  goto out;
536              }
537          }
538      }
539  
540      bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
541  
542  out:
543      if (ret) {
544          qed_free_l2_cache(&s->l2_cache);
545          qemu_vfree(s->l1_table);
546      }
547      return ret;
548  }
549  
550  typedef struct QEDOpenCo {
551      BlockDriverState *bs;
552      QDict *options;
553      int flags;
554      Error **errp;
555      int ret;
556  } QEDOpenCo;
557  
558  static void coroutine_fn bdrv_qed_open_entry(void *opaque)
559  {
560      QEDOpenCo *qoc = opaque;
561      BDRVQEDState *s = qoc->bs->opaque;
562  
563      qemu_co_mutex_lock(&s->table_lock);
564      qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
565      qemu_co_mutex_unlock(&s->table_lock);
566  }
567  
568  static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
569                           Error **errp)
570  {
571      QEDOpenCo qoc = {
572          .bs = bs,
573          .options = options,
574          .flags = flags,
575          .errp = errp,
576          .ret = -EINPROGRESS
577      };
578      int ret;
579  
580      ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
581      if (ret < 0) {
582          return ret;
583      }
584  
585      bdrv_qed_init_state(bs);
586      if (qemu_in_coroutine()) {
587          bdrv_qed_open_entry(&qoc);
588      } else {
589          assert(qemu_get_current_aio_context() == qemu_get_aio_context());
590          qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
591          BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
592      }
594      return qoc.ret;
595  }
596  
597  static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
598  {
599      BDRVQEDState *s = bs->opaque;
600  
601      bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
602      bs->bl.max_pwrite_zeroes = QEMU_ALIGN_DOWN(INT_MAX, s->header.cluster_size);
603  }
604  
605  /* We have nothing to do for QED reopen; the stubs just return
606   * success. */
607  static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
608                                     BlockReopenQueue *queue, Error **errp)
609  {
610      return 0;
611  }
612  
613  static void bdrv_qed_close(BlockDriverState *bs)
614  {
615      BDRVQEDState *s = bs->opaque;
616  
617      bdrv_qed_detach_aio_context(bs);
618  
619      /* Ensure writes reach stable storage */
620      bdrv_flush(bs->file->bs);
621  
622      /* Clean shutdown, no check required on next open */
623      if (s->header.features & QED_F_NEED_CHECK) {
624          s->header.features &= ~QED_F_NEED_CHECK;
625          qed_write_header_sync(s);
626      }
627  
628      qed_free_l2_cache(&s->l2_cache);
629      qemu_vfree(s->l1_table);
630  }
631  
632  static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
633                                             Error **errp)
634  {
635      BlockdevCreateOptionsQed *qed_opts;
636      BlockBackend *blk = NULL;
637      BlockDriverState *bs = NULL;
638  
639      QEDHeader header;
640      QEDHeader le_header;
641      uint8_t *l1_table = NULL;
642      size_t l1_size;
643      int ret = 0;
644  
645      assert(opts->driver == BLOCKDEV_DRIVER_QED);
646      qed_opts = &opts->u.qed;
647  
648      /* Validate options and set default values */
649      if (!qed_opts->has_cluster_size) {
650          qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
651      }
652      if (!qed_opts->has_table_size) {
653          qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
654      }
655  
656      if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
657          error_setg(errp, "QED cluster size must be within range [%u, %u] "
658                           "and power of 2",
659                     QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
660          return -EINVAL;
661      }
662      if (!qed_is_table_size_valid(qed_opts->table_size)) {
663          error_setg(errp, "QED table size must be within range [%u, %u] "
664                           "and power of 2",
665                     QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
666          return -EINVAL;
667      }
668      if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
669                                   qed_opts->table_size))
670      {
671          error_setg(errp, "QED image size must be a non-zero multiple of "
672                           "cluster size and less than %" PRIu64 " bytes",
673                     qed_max_image_size(qed_opts->cluster_size,
674                                        qed_opts->table_size));
675          return -EINVAL;
676      }
677  
678      /* Create BlockBackend to write to the image */
679      bs = bdrv_open_blockdev_ref(qed_opts->file, errp);
680      if (bs == NULL) {
681          return -EIO;
682      }
683  
684      blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
685                            errp);
686      if (!blk) {
687          ret = -EPERM;
688          goto out;
689      }
690      blk_set_allow_write_beyond_eof(blk, true);
691  
692      /* Prepare image format */
693      header = (QEDHeader) {
694          .magic = QED_MAGIC,
695          .cluster_size = qed_opts->cluster_size,
696          .table_size = qed_opts->table_size,
697          .header_size = 1,
698          .features = 0,
699          .compat_features = 0,
700          .l1_table_offset = qed_opts->cluster_size,
701          .image_size = qed_opts->size,
702      };
703  
704      l1_size = header.cluster_size * header.table_size;
705  
706      /*
707       * The QED format associates file length with allocation status,
708       * so a new file (which is empty) must have a length of 0.
709       */
710      ret = blk_co_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp);
711      if (ret < 0) {
712          goto out;
713      }
714  
715      if (qed_opts->backing_file) {
716          header.features |= QED_F_BACKING_FILE;
717          header.backing_filename_offset = sizeof(le_header);
718          header.backing_filename_size = strlen(qed_opts->backing_file);
719  
720          if (qed_opts->has_backing_fmt) {
721              const char *backing_fmt = BlockdevDriver_str(qed_opts->backing_fmt);
722              if (qed_fmt_is_raw(backing_fmt)) {
723                  header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
724              }
725          }
726      }
727  
728      qed_header_cpu_to_le(&header, &le_header);
729      ret = blk_co_pwrite(blk, 0, sizeof(le_header), &le_header, 0);
730      if (ret < 0) {
731          goto out;
732      }
733      ret = blk_co_pwrite(blk, sizeof(le_header), header.backing_filename_size,
734                       qed_opts->backing_file, 0);
735      if (ret < 0) {
736          goto out;
737      }
738  
739      l1_table = g_malloc0(l1_size);
740      ret = blk_co_pwrite(blk, header.l1_table_offset, l1_size, l1_table, 0);
741      if (ret < 0) {
742          goto out;
743      }
744  
745      ret = 0; /* success */
746  out:
747      g_free(l1_table);
748      blk_unref(blk);
749      bdrv_unref(bs);
750      return ret;
751  }
752  
753  static int coroutine_fn bdrv_qed_co_create_opts(BlockDriver *drv,
754                                                  const char *filename,
755                                                  QemuOpts *opts,
756                                                  Error **errp)
757  {
758      BlockdevCreateOptions *create_options = NULL;
759      QDict *qdict;
760      Visitor *v;
761      BlockDriverState *bs = NULL;
762      int ret;
763  
764      static const QDictRenames opt_renames[] = {
765          { BLOCK_OPT_BACKING_FILE,       "backing-file" },
766          { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
767          { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
768          { BLOCK_OPT_TABLE_SIZE,         "table-size" },
769          { NULL, NULL },
770      };
771  
772      /* Parse options and convert legacy syntax */
773      qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);
774  
775      if (!qdict_rename_keys(qdict, opt_renames, errp)) {
776          ret = -EINVAL;
777          goto fail;
778      }
779  
780      /* Create and open the file (protocol layer) */
781      ret = bdrv_co_create_file(filename, opts, errp);
782      if (ret < 0) {
783          goto fail;
784      }
785  
786      bs = bdrv_open(filename, NULL, NULL,
787                     BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
788      if (bs == NULL) {
789          ret = -EIO;
790          goto fail;
791      }
792  
793      /* Now get the QAPI type BlockdevCreateOptions */
794      qdict_put_str(qdict, "driver", "qed");
795      qdict_put_str(qdict, "file", bs->node_name);
796  
797      v = qobject_input_visitor_new_flat_confused(qdict, errp);
798      if (!v) {
799          ret = -EINVAL;
800          goto fail;
801      }
802  
803      visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
804      visit_free(v);
805      if (!create_options) {
806          ret = -EINVAL;
807          goto fail;
808      }
809  
810      /* Silently round up size */
811      assert(create_options->driver == BLOCKDEV_DRIVER_QED);
812      create_options->u.qed.size =
813          ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);
814  
815      /* Create the qed image (format layer) */
816      ret = bdrv_qed_co_create(create_options, errp);
817  
818  fail:
819      qobject_unref(qdict);
820      bdrv_unref(bs);
821      qapi_free_BlockdevCreateOptions(create_options);
822      return ret;
823  }
824  
825  static int coroutine_fn bdrv_qed_co_block_status(BlockDriverState *bs,
826                                                   bool want_zero,
827                                                   int64_t pos, int64_t bytes,
828                                                   int64_t *pnum, int64_t *map,
829                                                   BlockDriverState **file)
830  {
831      BDRVQEDState *s = bs->opaque;
832      size_t len = MIN(bytes, SIZE_MAX);
833      int status;
834      QEDRequest request = { .l2_table = NULL };
835      uint64_t offset;
836      int ret;
837  
838      qemu_co_mutex_lock(&s->table_lock);
839      ret = qed_find_cluster(s, &request, pos, &len, &offset);
840  
841      *pnum = len;
842      switch (ret) {
843      case QED_CLUSTER_FOUND:
844          *map = offset | qed_offset_into_cluster(s, pos);
845          status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
846          *file = bs->file->bs;
847          break;
848      case QED_CLUSTER_ZERO:
849          status = BDRV_BLOCK_ZERO;
850          break;
851      case QED_CLUSTER_L2:
852      case QED_CLUSTER_L1:
853          status = 0;
854          break;
855      default:
856          assert(ret < 0);
857          status = ret;
858          break;
859      }
860  
861      qed_unref_l2_cache_entry(request.l2_table);
862      qemu_co_mutex_unlock(&s->table_lock);
863  
864      return status;
865  }
866  
867  static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
868  {
869      return acb->bs->opaque;
870  }
871  
872  /**
873   * Read from the backing file or zero-fill if no backing file
874   *
875   * @s:              QED state
876   * @pos:            Byte position in device
877   * @qiov:           Destination I/O vector
878   *
879   * This function reads qiov->size bytes starting at pos from the backing file.
880   * If there is no backing file then zeroes are read.
881   */
882  static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
883                                                QEMUIOVector *qiov)
884  {
885      if (s->bs->backing) {
886          BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
887          return bdrv_co_preadv(s->bs->backing, pos, qiov->size, qiov, 0);
888      }
889      qemu_iovec_memset(qiov, 0, 0, qiov->size);
890      return 0;
891  }
892  
893  /**
894   * Copy data from backing file into the image
895   *
896   * @s:          QED state
897   * @pos:        Byte position in device
898   * @len:        Number of bytes
899   * @offset:     Byte offset in image file
900   */
901  static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
902                                                     uint64_t pos, uint64_t len,
903                                                     uint64_t offset)
904  {
905      QEMUIOVector qiov;
906      int ret;
907  
908      /* Skip copy entirely if there is no work to do */
909      if (len == 0) {
910          return 0;
911      }
912  
913      qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
914  
915      ret = qed_read_backing_file(s, pos, &qiov);
916  
917      if (ret) {
918          goto out;
919      }
920  
921      BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
922      ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
923      if (ret < 0) {
924          goto out;
925      }
926      ret = 0;
927  out:
928      qemu_vfree(qemu_iovec_buf(&qiov));
929      return ret;
930  }
931  
932  /**
933   * Link one or more contiguous clusters into a table
934   *
935   * @s:              QED state
936   * @table:          L2 table
937   * @index:          First cluster index
938   * @n:              Number of contiguous clusters
939   * @cluster:        First cluster offset
940   *
941   * The cluster offset may be an allocated byte offset in the image file, the
942   * zero cluster marker, or the unallocated cluster marker.
943   *
944   * Called with table_lock held.
945   */
946  static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
947                                               int index, unsigned int n,
948                                               uint64_t cluster)
949  {
950      int i;
951      for (i = index; i < index + n; i++) {
952          table->offsets[i] = cluster;
953          if (!qed_offset_is_unalloc_cluster(cluster) &&
954              !qed_offset_is_zero_cluster(cluster)) {
955              cluster += s->header.cluster_size;
956          }
957      }
958  }
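/*
 * For example, linking n = 3 freshly allocated clusters that start at byte
 * offset X stores X, X + cluster_size and X + 2 * cluster_size in consecutive
 * entries, whereas a zero-cluster or unallocated marker is repeated unchanged
 * in every entry because those special values do not name a position in the
 * image file.
 */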
959  
960  /* Called with table_lock held.  */
961  static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
962  {
963      BDRVQEDState *s = acb_to_s(acb);
964  
965      /* Free resources */
966      qemu_iovec_destroy(&acb->cur_qiov);
967      qed_unref_l2_cache_entry(acb->request.l2_table);
968  
969      /* Free the buffer we may have allocated for zero writes */
970      if (acb->flags & QED_AIOCB_ZERO) {
971          qemu_vfree(acb->qiov->iov[0].iov_base);
972          acb->qiov->iov[0].iov_base = NULL;
973      }
974  
975      /* Start next allocating write request waiting behind this one.  Note that
976       * requests enqueue themselves when they first hit an unallocated cluster
977       * but they wait until the entire request is finished before waking up the
978       * next request in the queue.  This ensures that we don't cycle through
979       * requests multiple times but rather finish one at a time completely.
980       */
981      if (acb == s->allocating_acb) {
982          s->allocating_acb = NULL;
983          if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
984              qemu_co_queue_next(&s->allocating_write_reqs);
985          } else if (s->header.features & QED_F_NEED_CHECK) {
986              qed_start_need_check_timer(s);
987          }
988      }
989  }
990  
991  /**
992   * Update L1 table with new L2 table offset and write it out
993   *
994   * Called with table_lock held.
995   */
996  static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
997  {
998      BDRVQEDState *s = acb_to_s(acb);
999      CachedL2Table *l2_table = acb->request.l2_table;
1000      uint64_t l2_offset = l2_table->offset;
1001      int index, ret;
1002  
1003      index = qed_l1_index(s, acb->cur_pos);
1004      s->l1_table->offsets[index] = l2_table->offset;
1005  
1006      ret = qed_write_l1_table(s, index, 1);
1007  
1008      /* Commit the current L2 table to the cache */
1009      qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
1010  
1011      /* This is guaranteed to succeed because we just committed the entry to the
1012       * cache.
1013       */
1014      acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
1015      assert(acb->request.l2_table != NULL);
1016  
1017      return ret;
1018  }
1019  
1020  
1021  /**
1022   * Update L2 table with new cluster offsets and write them out
1023   *
1024   * Called with table_lock held.
1025   */
1026  static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
1027  {
1028      BDRVQEDState *s = acb_to_s(acb);
1029      bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1030      int index, ret;
1031  
1032      if (need_alloc) {
1033          qed_unref_l2_cache_entry(acb->request.l2_table);
1034          acb->request.l2_table = qed_new_l2_table(s);
1035      }
1036  
1037      index = qed_l2_index(s, acb->cur_pos);
1038      qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1039                           offset);
1040  
1041      if (need_alloc) {
1042          /* Write out the whole new L2 table */
1043          ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
1044          if (ret) {
1045              return ret;
1046          }
1047          return qed_aio_write_l1_update(acb);
1048      } else {
1049          /* Write out only the updated part of the L2 table */
1050          ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
1051                                   false);
1052          if (ret) {
1053              return ret;
1054          }
1055      }
1056      return 0;
1057  }
1058  
1059  /**
1060   * Write data to the image file
1061   *
1062   * Called with table_lock *not* held.
1063   */
1064  static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
1065  {
1066      BDRVQEDState *s = acb_to_s(acb);
1067      uint64_t offset = acb->cur_cluster +
1068                        qed_offset_into_cluster(s, acb->cur_pos);
1069  
1070      trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
1071  
1072      BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1073      return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
1074                             &acb->cur_qiov, 0);
1075  }
1076  
1077  /**
1078   * Populate untouched regions of new data cluster
1079   *
1080   * Called with table_lock held.
1081   */
1082  static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
1083  {
1084      BDRVQEDState *s = acb_to_s(acb);
1085      uint64_t start, len, offset;
1086      int ret;
1087  
1088      qemu_co_mutex_unlock(&s->table_lock);
1089  
1090      /* Populate front untouched region of new data cluster */
1091      start = qed_start_of_cluster(s, acb->cur_pos);
1092      len = qed_offset_into_cluster(s, acb->cur_pos);
1093  
1094      trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1095      ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
1096      if (ret < 0) {
1097          goto out;
1098      }
1099  
1100      /* Populate back untouched region of new data cluster */
1101      start = acb->cur_pos + acb->cur_qiov.size;
1102      len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
1103      offset = acb->cur_cluster +
1104               qed_offset_into_cluster(s, acb->cur_pos) +
1105               acb->cur_qiov.size;
1106  
1107      trace_qed_aio_write_postfill(s, acb, start, len, offset);
1108      ret = qed_copy_from_backing_file(s, start, len, offset);
1109      if (ret < 0) {
1110          goto out;
1111      }
1112  
1113      ret = qed_aio_write_main(acb);
1114      if (ret < 0) {
1115          goto out;
1116      }
1117  
1118      if (s->bs->backing) {
1119          /*
1120           * Flush new data clusters before updating the L2 table
1121           *
1122           * This flush is necessary when a backing file is in use.  A crash
1123           * during an allocating write could result in empty clusters in the
1124           * image.  If the write only touched a subregion of the cluster,
1125           * then backing image sectors have been lost in the untouched
1126           * region.  The solution is to flush after writing a new data
1127           * cluster and before updating the L2 table.
1128           */
1129          ret = bdrv_co_flush(s->bs->file->bs);
1130      }
1131  
1132  out:
1133      qemu_co_mutex_lock(&s->table_lock);
1134      return ret;
1135  }
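/*
 * Worked example of the copy-on-write regions above, assuming 64 KiB
 * clusters: an 8 KiB write at guest offset 68 KiB falls into the cluster
 * covering [64 KiB, 128 KiB).  The prefill copies the 4 KiB in front of the
 * write ([64 KiB, 68 KiB)) from the backing file, the postfill copies the
 * 52 KiB behind it ([76 KiB, 128 KiB)), and then the guest data itself is
 * written; the L2 table is only updated after all three pieces are in place.
 */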
1136  
1137  /**
1138   * Check if the QED_F_NEED_CHECK bit should be set during allocating write
1139   */
1140  static bool qed_should_set_need_check(BDRVQEDState *s)
1141  {
1142      /* The flush before L2 update path ensures consistency */
1143      if (s->bs->backing) {
1144          return false;
1145      }
1146  
1147      return !(s->header.features & QED_F_NEED_CHECK);
1148  }
1149  
1150  /**
1151   * Write new data cluster
1152   *
1153   * @acb:        Write request
1154   * @len:        Length in bytes
1155   *
1156   * This path is taken when writing to previously unallocated clusters.
1157   *
1158   * Called with table_lock held.
1159   */
1160  static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1161  {
1162      BDRVQEDState *s = acb_to_s(acb);
1163      int ret;
1164  
1165      /* Cancel timer when the first allocating request comes in */
1166      if (s->allocating_acb == NULL) {
1167          qed_cancel_need_check_timer(s);
1168      }
1169  
1170      /* Freeze this request if another allocating write is in progress */
1171      if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
1172          if (s->allocating_acb != NULL) {
1173              qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
1174              assert(s->allocating_acb == NULL);
1175          }
1176          s->allocating_acb = acb;
1177          return -EAGAIN; /* start over with looking up table entries */
1178      }
1179  
1180      acb->cur_nclusters = qed_bytes_to_clusters(s,
1181              qed_offset_into_cluster(s, acb->cur_pos) + len);
1182      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1183  
1184      if (acb->flags & QED_AIOCB_ZERO) {
1185          /* Skip ahead if the clusters are already zero */
1186          if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1187              return 0;
1188          }
1189          acb->cur_cluster = 1;
1190      } else {
1191          acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
1192      }
1193  
1194      if (qed_should_set_need_check(s)) {
1195          s->header.features |= QED_F_NEED_CHECK;
1196          ret = qed_write_header(s);
1197          if (ret < 0) {
1198              return ret;
1199          }
1200      }
1201  
1202      if (!(acb->flags & QED_AIOCB_ZERO)) {
1203          ret = qed_aio_write_cow(acb);
1204          if (ret < 0) {
1205              return ret;
1206          }
1207      }
1208  
1209      return qed_aio_write_l2_update(acb, acb->cur_cluster);
1210  }
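/*
 * Descriptive note: the first request to touch an unallocated cluster
 * installs itself as s->allocating_acb; later requests queue on
 * allocating_write_reqs.  The -EAGAIN return above makes qed_aio_next_io()
 * redo the cluster lookup, both for the request that just installed itself
 * and for queued requests after they are woken, since the mapping may have
 * been created in the meantime by the request ahead of them.
 */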
1211  
1212  /**
1213   * Write data cluster in place
1214   *
1215   * @acb:        Write request
1216   * @offset:     Cluster offset in bytes
1217   * @len:        Length in bytes
1218   *
1219   * This path is taken when writing to already allocated clusters.
1220   *
1221   * Called with table_lock held.
1222   */
1223  static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
1224                                                size_t len)
1225  {
1226      BDRVQEDState *s = acb_to_s(acb);
1227      int r;
1228  
1229      qemu_co_mutex_unlock(&s->table_lock);
1230  
1231      /* Allocate buffer for zero writes */
1232      if (acb->flags & QED_AIOCB_ZERO) {
1233          struct iovec *iov = acb->qiov->iov;
1234  
1235          if (!iov->iov_base) {
1236              iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
1237              if (iov->iov_base == NULL) {
1238                  r = -ENOMEM;
1239                  goto out;
1240              }
1241              memset(iov->iov_base, 0, iov->iov_len);
1242          }
1243      }
1244  
1245      /* Calculate the I/O vector */
1246      acb->cur_cluster = offset;
1247      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1248  
1249      /* Do the actual write.  */
1250      r = qed_aio_write_main(acb);
1251  out:
1252      qemu_co_mutex_lock(&s->table_lock);
1253      return r;
1254  }
1255  
1256  /**
1257   * Write data cluster
1258   *
1259   * @opaque:     Write request
1260   * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1261   * @offset:     Cluster offset in bytes
1262   * @len:        Length in bytes
1263   *
1264   * Called with table_lock held.
1265   */
1266  static int coroutine_fn qed_aio_write_data(void *opaque, int ret,
1267                                             uint64_t offset, size_t len)
1268  {
1269      QEDAIOCB *acb = opaque;
1270  
1271      trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1272  
1273      acb->find_cluster_ret = ret;
1274  
1275      switch (ret) {
1276      case QED_CLUSTER_FOUND:
1277          return qed_aio_write_inplace(acb, offset, len);
1278  
1279      case QED_CLUSTER_L2:
1280      case QED_CLUSTER_L1:
1281      case QED_CLUSTER_ZERO:
1282          return qed_aio_write_alloc(acb, len);
1283  
1284      default:
1285          g_assert_not_reached();
1286      }
1287  }
1288  
1289  /**
1290   * Read data cluster
1291   *
1292   * @opaque:     Read request
1293   * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1294   * @offset:     Cluster offset in bytes
1295   * @len:        Length in bytes
1296   *
1297   * Called with table_lock held.
1298   */
1299  static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
1300                                            uint64_t offset, size_t len)
1301  {
1302      QEDAIOCB *acb = opaque;
1303      BDRVQEDState *s = acb_to_s(acb);
1304      BlockDriverState *bs = acb->bs;
1305      int r;
1306  
1307      qemu_co_mutex_unlock(&s->table_lock);
1308  
1309      /* Adjust offset into cluster */
1310      offset += qed_offset_into_cluster(s, acb->cur_pos);
1311  
1312      trace_qed_aio_read_data(s, acb, ret, offset, len);
1313  
1314      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1315  
1316      /* Handle zero cluster and backing file reads, otherwise read
1317       * data cluster directly.
1318       */
1319      if (ret == QED_CLUSTER_ZERO) {
1320          qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
1321          r = 0;
1322      } else if (ret != QED_CLUSTER_FOUND) {
1323          r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov);
1324      } else {
1325          BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
1326          r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
1327                             &acb->cur_qiov, 0);
1328      }
1329  
1330      qemu_co_mutex_lock(&s->table_lock);
1331      return r;
1332  }
1333  
1334  /**
1335   * Begin next I/O or complete the request
1336   */
1337  static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
1338  {
1339      BDRVQEDState *s = acb_to_s(acb);
1340      uint64_t offset;
1341      size_t len;
1342      int ret;
1343  
1344      qemu_co_mutex_lock(&s->table_lock);
1345      while (1) {
1346          trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
1347  
1348          acb->qiov_offset += acb->cur_qiov.size;
1349          acb->cur_pos += acb->cur_qiov.size;
1350          qemu_iovec_reset(&acb->cur_qiov);
1351  
1352          /* Complete request */
1353          if (acb->cur_pos >= acb->end_pos) {
1354              ret = 0;
1355              break;
1356          }
1357  
1358          /* Find next cluster and start I/O */
1359          len = acb->end_pos - acb->cur_pos;
1360          ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
1361          if (ret < 0) {
1362              break;
1363          }
1364  
1365          if (acb->flags & QED_AIOCB_WRITE) {
1366              ret = qed_aio_write_data(acb, ret, offset, len);
1367          } else {
1368              ret = qed_aio_read_data(acb, ret, offset, len);
1369          }
1370  
1371          if (ret < 0 && ret != -EAGAIN) {
1372              break;
1373          }
1374      }
1375  
1376      trace_qed_aio_complete(s, acb, ret);
1377      qed_aio_complete(acb);
1378      qemu_co_mutex_unlock(&s->table_lock);
1379      return ret;
1380  }
1381  
1382  static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
1383                                         QEMUIOVector *qiov, int nb_sectors,
1384                                         int flags)
1385  {
1386      QEDAIOCB acb = {
1387          .bs         = bs,
1388          .cur_pos    = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
1389          .end_pos    = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
1390          .qiov       = qiov,
1391          .flags      = flags,
1392      };
1393      qemu_iovec_init(&acb.cur_qiov, qiov->niov);
1394  
1395      trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
1396  
1397      /* Start request */
1398      return qed_aio_next_io(&acb);
1399  }
1400  
1401  static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
1402                                            int64_t sector_num, int nb_sectors,
1403                                            QEMUIOVector *qiov)
1404  {
1405      return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
1406  }
1407  
1408  static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
1409                                             int64_t sector_num, int nb_sectors,
1410                                             QEMUIOVector *qiov, int flags)
1411  {
1412      return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
1413  }
1414  
1415  static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
1416                                                    int64_t offset,
1417                                                    int64_t bytes,
1418                                                    BdrvRequestFlags flags)
1419  {
1420      BDRVQEDState *s = bs->opaque;
1421  
1422      /*
1423       * Zero writes start without an I/O buffer.  If a buffer becomes necessary
1424       * then it will be allocated during request processing.
1425       */
1426      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
1427  
1428      /*
1429       * QED is not prepared for 63bit write-zero requests, so rely on
1430       * max_pwrite_zeroes.
1431       */
1432      assert(bytes <= INT_MAX);
1433  
1434      /* Fall back if the request is not aligned */
1435      if (qed_offset_into_cluster(s, offset) ||
1436          qed_offset_into_cluster(s, bytes)) {
1437          return -ENOTSUP;
1438      }
1439  
1440      return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
1441                            bytes >> BDRV_SECTOR_BITS,
1442                            QED_AIOCB_WRITE | QED_AIOCB_ZERO);
1443  }
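/*
 * Example (assuming 64 KiB clusters): a 128 KiB zero request at offset 1 MiB
 * is cluster aligned and is satisfied by marking two L2 entries with the
 * zero-cluster value (or writing literal zeroes if the clusters are already
 * allocated), whereas a 4 KiB request at offset 1 MiB + 2 KiB returns
 * -ENOTSUP here and is expected to be emulated by the generic block layer
 * with an ordinary write of zeroes.
 */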
1444  
1445  static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
1446                                               int64_t offset,
1447                                               bool exact,
1448                                               PreallocMode prealloc,
1449                                               BdrvRequestFlags flags,
1450                                               Error **errp)
1451  {
1452      BDRVQEDState *s = bs->opaque;
1453      uint64_t old_image_size;
1454      int ret;
1455  
1456      if (prealloc != PREALLOC_MODE_OFF) {
1457          error_setg(errp, "Unsupported preallocation mode '%s'",
1458                     PreallocMode_str(prealloc));
1459          return -ENOTSUP;
1460      }
1461  
1462      if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1463                                   s->header.table_size)) {
1464          error_setg(errp, "Invalid image size specified");
1465          return -EINVAL;
1466      }
1467  
1468      if ((uint64_t)offset < s->header.image_size) {
1469          error_setg(errp, "Shrinking images is currently not supported");
1470          return -ENOTSUP;
1471      }
1472  
1473      old_image_size = s->header.image_size;
1474      s->header.image_size = offset;
1475      ret = qed_write_header_sync(s);
1476      if (ret < 0) {
1477          s->header.image_size = old_image_size;
1478          error_setg_errno(errp, -ret, "Failed to update the image size");
1479      }
1480      return ret;
1481  }
1482  
1483  static int64_t coroutine_fn bdrv_qed_co_getlength(BlockDriverState *bs)
1484  {
1485      BDRVQEDState *s = bs->opaque;
1486      return s->header.image_size;
1487  }
1488  
1489  static int coroutine_fn
1490  bdrv_qed_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1491  {
1492      BDRVQEDState *s = bs->opaque;
1493  
1494      memset(bdi, 0, sizeof(*bdi));
1495      bdi->cluster_size = s->header.cluster_size;
1496      bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1497      return 0;
1498  }
1499  
1500  static int bdrv_qed_change_backing_file(BlockDriverState *bs,
1501                                          const char *backing_file,
1502                                          const char *backing_fmt)
1503  {
1504      BDRVQEDState *s = bs->opaque;
1505      QEDHeader new_header, le_header;
1506      void *buffer;
1507      size_t buffer_len, backing_file_len;
1508      int ret;
1509  
1510      /* Refuse to set backing filename if unknown compat feature bits are
1511       * active.  If the image uses an unknown compat feature then we may not
1512       * know the layout of data following the header structure and cannot safely
1513       * add a new string.
1514       */
1515      if (backing_file && (s->header.compat_features &
1516                           ~QED_COMPAT_FEATURE_MASK)) {
1517          return -ENOTSUP;
1518      }
1519  
1520      memcpy(&new_header, &s->header, sizeof(new_header));
1521  
1522      new_header.features &= ~(QED_F_BACKING_FILE |
1523                               QED_F_BACKING_FORMAT_NO_PROBE);
1524  
1525      /* Adjust feature flags */
1526      if (backing_file) {
1527          new_header.features |= QED_F_BACKING_FILE;
1528  
1529          if (qed_fmt_is_raw(backing_fmt)) {
1530              new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
1531          }
1532      }
1533  
1534      /* Calculate new header size */
1535      backing_file_len = 0;
1536  
1537      if (backing_file) {
1538          backing_file_len = strlen(backing_file);
1539      }
1540  
1541      buffer_len = sizeof(new_header);
1542      new_header.backing_filename_offset = buffer_len;
1543      new_header.backing_filename_size = backing_file_len;
1544      buffer_len += backing_file_len;
1545  
1546      /* Make sure we can rewrite header without failing */
1547      if (buffer_len > new_header.header_size * new_header.cluster_size) {
1548          return -ENOSPC;
1549      }
1550  
1551      /* Prepare new header */
1552      buffer = g_malloc(buffer_len);
1553  
1554      qed_header_cpu_to_le(&new_header, &le_header);
1555      memcpy(buffer, &le_header, sizeof(le_header));
1556      buffer_len = sizeof(le_header);
1557  
1558      if (backing_file) {
1559          memcpy(buffer + buffer_len, backing_file, backing_file_len);
1560          buffer_len += backing_file_len;
1561      }
1562  
1563      /* Write new header */
1564      ret = bdrv_pwrite_sync(bs->file, 0, buffer_len, buffer, 0);
1565      g_free(buffer);
1566      if (ret == 0) {
1567          memcpy(&s->header, &new_header, sizeof(new_header));
1568      }
1569      return ret;
1570  }
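/*
 * Resulting on-disk layout (sketch): the reserved header clusters hold the
 * 64-byte little-endian QEDHeader immediately followed by the backing
 * filename bytes; no NUL terminator is stored, backing_filename_size gives
 * the length.  The -ENOSPC check above guarantees that the rewritten region
 * still fits within header_size * cluster_size bytes.
 */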
1571  
1572  static void coroutine_fn bdrv_qed_co_invalidate_cache(BlockDriverState *bs,
1573                                                        Error **errp)
1574  {
1575      BDRVQEDState *s = bs->opaque;
1576      int ret;
1577  
1578      bdrv_qed_close(bs);
1579  
1580      bdrv_qed_init_state(bs);
1581      qemu_co_mutex_lock(&s->table_lock);
1582      ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, errp);
1583      qemu_co_mutex_unlock(&s->table_lock);
1584      if (ret < 0) {
1585          error_prepend(errp, "Could not reopen qed layer: ");
1586      }
1587  }
1588  
1589  static int coroutine_fn bdrv_qed_co_check(BlockDriverState *bs,
1590                                            BdrvCheckResult *result,
1591                                            BdrvCheckMode fix)
1592  {
1593      BDRVQEDState *s = bs->opaque;
1594      int ret;
1595  
1596      qemu_co_mutex_lock(&s->table_lock);
1597      ret = qed_check(s, result, !!fix);
1598      qemu_co_mutex_unlock(&s->table_lock);
1599  
1600      return ret;
1601  }
1602  
1603  static QemuOptsList qed_create_opts = {
1604      .name = "qed-create-opts",
1605      .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
1606      .desc = {
1607          {
1608              .name = BLOCK_OPT_SIZE,
1609              .type = QEMU_OPT_SIZE,
1610              .help = "Virtual disk size"
1611          },
1612          {
1613              .name = BLOCK_OPT_BACKING_FILE,
1614              .type = QEMU_OPT_STRING,
1615              .help = "File name of a base image"
1616          },
1617          {
1618              .name = BLOCK_OPT_BACKING_FMT,
1619              .type = QEMU_OPT_STRING,
1620              .help = "Image format of the base image"
1621          },
1622          {
1623              .name = BLOCK_OPT_CLUSTER_SIZE,
1624              .type = QEMU_OPT_SIZE,
1625              .help = "Cluster size (in bytes)",
1626              .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
1627          },
1628          {
1629              .name = BLOCK_OPT_TABLE_SIZE,
1630              .type = QEMU_OPT_SIZE,
1631              .help = "L1/L2 table size (in clusters)"
1632          },
1633          { /* end of list */ }
1634      }
1635  };
1636  
1637  static BlockDriver bdrv_qed = {
1638      .format_name              = "qed",
1639      .instance_size            = sizeof(BDRVQEDState),
1640      .create_opts              = &qed_create_opts,
1641      .is_format                = true,
1642      .supports_backing         = true,
1643  
1644      .bdrv_probe               = bdrv_qed_probe,
1645      .bdrv_open                = bdrv_qed_open,
1646      .bdrv_close               = bdrv_qed_close,
1647      .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
1648      .bdrv_child_perm          = bdrv_default_perms,
1649      .bdrv_co_create           = bdrv_qed_co_create,
1650      .bdrv_co_create_opts      = bdrv_qed_co_create_opts,
1651      .bdrv_has_zero_init       = bdrv_has_zero_init_1,
1652      .bdrv_co_block_status     = bdrv_qed_co_block_status,
1653      .bdrv_co_readv            = bdrv_qed_co_readv,
1654      .bdrv_co_writev           = bdrv_qed_co_writev,
1655      .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
1656      .bdrv_co_truncate         = bdrv_qed_co_truncate,
1657      .bdrv_co_getlength        = bdrv_qed_co_getlength,
1658      .bdrv_co_get_info         = bdrv_qed_co_get_info,
1659      .bdrv_refresh_limits      = bdrv_qed_refresh_limits,
1660      .bdrv_change_backing_file = bdrv_qed_change_backing_file,
1661      .bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
1662      .bdrv_co_check            = bdrv_qed_co_check,
1663      .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
1664      .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
1665      .bdrv_drain_begin         = bdrv_qed_drain_begin,
1666  };
1667  
1668  static void bdrv_qed_init(void)
1669  {
1670      bdrv_register(&bdrv_qed);
1671  }
1672  
1673  block_init(bdrv_qed_init);
1674