xref: /openbmc/qemu/block/qed.c (revision e692f9c6a681de1372a41999b14a947a553b6a1a)
1  /*
2   * QEMU Enhanced Disk Format
3   *
4   * Copyright IBM, Corp. 2010
5   *
6   * Authors:
7   *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
8   *  Anthony Liguori   <aliguori@us.ibm.com>
9   *
10   * This work is licensed under the terms of the GNU LGPL, version 2 or later.
11   * See the COPYING.LIB file in the top-level directory.
12   *
13   */
14  
15  #include "qemu/osdep.h"
16  #include "block/qdict.h"
17  #include "qapi/error.h"
18  #include "qemu/timer.h"
19  #include "qemu/bswap.h"
20  #include "qemu/main-loop.h"
21  #include "qemu/module.h"
22  #include "qemu/option.h"
23  #include "qemu/memalign.h"
24  #include "trace.h"
25  #include "qed.h"
26  #include "sysemu/block-backend.h"
27  #include "qapi/qmp/qdict.h"
28  #include "qapi/qobject-input-visitor.h"
29  #include "qapi/qapi-visit-block-core.h"
30  
31  static QemuOptsList qed_create_opts;
32  
33  static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
34                            const char *filename)
35  {
36      const QEDHeader *header = (const QEDHeader *)buf;
37  
38      if (buf_size < sizeof(*header)) {
39          return 0;
40      }
41      if (le32_to_cpu(header->magic) != QED_MAGIC) {
42          return 0;
43      }
44      return 100;
45  }
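/*
 * For reference: QED_MAGIC corresponds to the ASCII bytes 'Q', 'E', 'D'
 * followed by a zero byte, stored little-endian, so a valid image begins
 * with "QED\0" on disk.  Returning 100 signals a definite match to the
 * probing code, while 0 means "not a QED image".
 */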
46  
47  /**
48   * Check whether an image format is raw
49   *
50   * @fmt:    Backing file format, may be NULL
51   */
52  static bool qed_fmt_is_raw(const char *fmt)
53  {
54      return fmt && strcmp(fmt, "raw") == 0;
55  }
56  
57  static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
58  {
59      cpu->magic = le32_to_cpu(le->magic);
60      cpu->cluster_size = le32_to_cpu(le->cluster_size);
61      cpu->table_size = le32_to_cpu(le->table_size);
62      cpu->header_size = le32_to_cpu(le->header_size);
63      cpu->features = le64_to_cpu(le->features);
64      cpu->compat_features = le64_to_cpu(le->compat_features);
65      cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
66      cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
67      cpu->image_size = le64_to_cpu(le->image_size);
68      cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
69      cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
70  }
71  
72  static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
73  {
74      le->magic = cpu_to_le32(cpu->magic);
75      le->cluster_size = cpu_to_le32(cpu->cluster_size);
76      le->table_size = cpu_to_le32(cpu->table_size);
77      le->header_size = cpu_to_le32(cpu->header_size);
78      le->features = cpu_to_le64(cpu->features);
79      le->compat_features = cpu_to_le64(cpu->compat_features);
80      le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
81      le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
82      le->image_size = cpu_to_le64(cpu->image_size);
83      le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
84      le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
85  }
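/*
 * All QED header fields are stored little-endian on disk; the two helpers
 * above convert between the on-disk and host representations at load and
 * store time, so a round trip through qed_header_cpu_to_le() and
 * qed_header_le_to_cpu() is the identity on any host byte order.
 */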
86  
87  int qed_write_header_sync(BDRVQEDState *s)
88  {
89      QEDHeader le;
90  
91      qed_header_cpu_to_le(&s->header, &le);
92      return bdrv_pwrite(s->bs->file, 0, sizeof(le), &le, 0);
93  }
94  
95  /**
96   * Update header in-place (does not rewrite backing filename or other strings)
97   *
98   * This function only updates known header fields in-place and does not affect
99   * extra data after the QED header.
100   *
101   * No new allocating reqs can start while this function runs.
102   */
103  static int coroutine_fn GRAPH_RDLOCK qed_write_header(BDRVQEDState *s)
104  {
105      /* We must write full sectors for O_DIRECT but cannot necessarily generate
106       * the data following the header if an unrecognized compat feature is
107       * active.  Therefore, first read the sectors containing the header, update
108       * them, and write back.
109       */
110  
111      int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
112      size_t len = nsectors * BDRV_SECTOR_SIZE;
113      uint8_t *buf;
114      int ret;
115  
116      assert(s->allocating_acb || s->allocating_write_reqs_plugged);
117  
118      buf = qemu_blockalign(s->bs, len);
119  
120      ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
121      if (ret < 0) {
122          goto out;
123      }
124  
125      /* Update header */
126      qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
127  
128      ret = bdrv_co_pwrite(s->bs->file, 0, len,  buf, 0);
129      if (ret < 0) {
130          goto out;
131      }
132  
133      ret = 0;
134  out:
135      qemu_vfree(buf);
136      return ret;
137  }
138  
139  static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
140  {
141      uint64_t table_entries;
142      uint64_t l2_size;
143  
144      table_entries = (table_size * cluster_size) / sizeof(uint64_t);
145      l2_size = table_entries * cluster_size;
146  
147      return l2_size * table_entries;
148  }
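/*
 * Worked example, assuming the default 64 KiB cluster size and a table size
 * of 4 clusters:
 *
 *   table_entries = (4 * 65536) / sizeof(uint64_t) = 32768
 *   l2_size       = 32768 * 65536                   = 2 GiB per L2 table
 *   maximum image = 2 GiB * 32768                   = 64 TiB
 */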
149  
150  static bool qed_is_cluster_size_valid(uint32_t cluster_size)
151  {
152      if (cluster_size < QED_MIN_CLUSTER_SIZE ||
153          cluster_size > QED_MAX_CLUSTER_SIZE) {
154          return false;
155      }
156      if (cluster_size & (cluster_size - 1)) {
157          return false; /* not power of 2 */
158      }
159      return true;
160  }
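/*
 * The "x & (x - 1)" test above works because a power of two has exactly one
 * bit set; subtracting one clears that bit and sets all lower bits, so the
 * AND is zero only for powers of two (e.g. 0x10000 & 0xffff == 0, whereas
 * 0x18000 & 0x17fff == 0x10000).
 */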
161  
162  static bool qed_is_table_size_valid(uint32_t table_size)
163  {
164      if (table_size < QED_MIN_TABLE_SIZE ||
165          table_size > QED_MAX_TABLE_SIZE) {
166          return false;
167      }
168      if (table_size & (table_size - 1)) {
169          return false; /* not power of 2 */
170      }
171      return true;
172  }
173  
174  static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
175                                      uint32_t table_size)
176  {
177      if (image_size % BDRV_SECTOR_SIZE != 0) {
178          return false; /* not multiple of sector size */
179      }
180      if (image_size > qed_max_image_size(cluster_size, table_size)) {
181          return false; /* image is too large */
182      }
183      return true;
184  }
185  
186  /**
187   * Read a string of known length from the image file
188   *
189   * @file:       Image file
190   * @offset:     File offset to start of string, in bytes
191   * @n:          String length in bytes
192   * @buf:        Destination buffer
193   * @buflen:     Destination buffer length in bytes
194   * @ret:        0 on success, -errno on failure
195   *
196   * The string is NUL-terminated.
197   */
198  static int coroutine_fn GRAPH_RDLOCK
199  qed_read_string(BdrvChild *file, uint64_t offset,
200                  size_t n, char *buf, size_t buflen)
201  {
202      int ret;
203      if (n >= buflen) {
204          return -EINVAL;
205      }
206      ret = bdrv_co_pread(file, offset, n, buf, 0);
207      if (ret < 0) {
208          return ret;
209      }
210      buf[n] = '\0';
211      return 0;
212  }
213  
214  /**
215   * Allocate new clusters
216   *
217   * @s:          QED state
218   * @n:          Number of contiguous clusters to allocate
219   * @ret:        Offset of first allocated cluster
220   *
221   * This function only produces the offset where the new clusters should be
222   * written.  It updates BDRVQEDState but does not make any changes to the image
223   * file.
224   *
225   * Called with table_lock held.
226   */
227  static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
228  {
229      uint64_t offset = s->file_size;
230      s->file_size += n * s->header.cluster_size;
231      return offset;
232  }
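/*
 * Example: with 64 KiB clusters, qed_alloc_clusters(s, 2) returns the current
 * end-of-file offset and advances s->file_size by 128 KiB.  Nothing is
 * written here; the clusters only become meaningful once data or table
 * contents are actually written to them.
 */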
233  
234  QEDTable *qed_alloc_table(BDRVQEDState *s)
235  {
236      /* Honor O_DIRECT memory alignment requirements */
237      return qemu_blockalign(s->bs,
238                             s->header.cluster_size * s->header.table_size);
239  }
240  
241  /**
242   * Allocate a new zeroed L2 table
243   *
244   * Called with table_lock held.
245   */
246  static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
247  {
248      CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
249  
250      l2_table->table = qed_alloc_table(s);
251      l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
252  
253      memset(l2_table->table->offsets, 0,
254             s->header.cluster_size * s->header.table_size);
255      return l2_table;
256  }
257  
258  static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s)
259  {
260      qemu_co_mutex_lock(&s->table_lock);
261  
262      /* No reentrancy is allowed.  */
263      assert(!s->allocating_write_reqs_plugged);
264      if (s->allocating_acb != NULL) {
265          /* Another allocating write came concurrently.  This cannot happen
266           * from bdrv_qed_drain_begin, but it can happen when the timer runs.
267           */
268          qemu_co_mutex_unlock(&s->table_lock);
269          return false;
270      }
271  
272      s->allocating_write_reqs_plugged = true;
273      qemu_co_mutex_unlock(&s->table_lock);
274      return true;
275  }
276  
277  static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s)
278  {
279      qemu_co_mutex_lock(&s->table_lock);
280      assert(s->allocating_write_reqs_plugged);
281      s->allocating_write_reqs_plugged = false;
282      qemu_co_queue_next(&s->allocating_write_reqs);
283      qemu_co_mutex_unlock(&s->table_lock);
284  }
285  
286  static void coroutine_fn GRAPH_RDLOCK qed_need_check_timer(BDRVQEDState *s)
287  {
288      int ret;
289  
290      trace_qed_need_check_timer_cb(s);
291      assert_bdrv_graph_readable();
292  
293      if (!qed_plug_allocating_write_reqs(s)) {
294          return;
295      }
296  
297      /* Ensure writes are on disk before clearing flag */
298      ret = bdrv_co_flush(s->bs->file->bs);
299      if (ret < 0) {
300          qed_unplug_allocating_write_reqs(s);
301          return;
302      }
303  
304      s->header.features &= ~QED_F_NEED_CHECK;
305      ret = qed_write_header(s);
306      (void) ret;
307  
308      qed_unplug_allocating_write_reqs(s);
309  
310      ret = bdrv_co_flush(s->bs);
311      (void) ret;
312  }
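/*
 * qed_need_check_timer() runs from the need-check timer and is what
 * eventually clears QED_F_NEED_CHECK on a quiescent image: once no
 * allocating write has arrived for QED_NEED_CHECK_TIMEOUT seconds, the
 * dirty flag is dropped and the header rewritten, so the next open does not
 * need a consistency check.
 */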
313  
314  static void coroutine_fn qed_need_check_timer_entry(void *opaque)
315  {
316      BDRVQEDState *s = opaque;
317      GRAPH_RDLOCK_GUARD();
318  
319      qed_need_check_timer(opaque);
320      bdrv_dec_in_flight(s->bs);
321  }
322  
323  static void qed_need_check_timer_cb(void *opaque)
324  {
325      BDRVQEDState *s = opaque;
326      Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
327  
328      bdrv_inc_in_flight(s->bs);
329      qemu_coroutine_enter(co);
330  }
331  
332  static void qed_start_need_check_timer(BDRVQEDState *s)
333  {
334      trace_qed_start_need_check_timer(s);
335  
336      /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
337       * migration.
338       */
339      timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
340                     NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
341  }
342  
343  /* It's okay to call this multiple times or when no timer is started */
344  static void qed_cancel_need_check_timer(BDRVQEDState *s)
345  {
346      trace_qed_cancel_need_check_timer(s);
347      timer_del(s->need_check_timer);
348  }
349  
350  static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
351  {
352      BDRVQEDState *s = bs->opaque;
353  
354      qed_cancel_need_check_timer(s);
355      timer_free(s->need_check_timer);
356  }
357  
358  static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
359                                          AioContext *new_context)
360  {
361      BDRVQEDState *s = bs->opaque;
362  
363      s->need_check_timer = aio_timer_new(new_context,
364                                          QEMU_CLOCK_VIRTUAL, SCALE_NS,
365                                          qed_need_check_timer_cb, s);
366      if (s->header.features & QED_F_NEED_CHECK) {
367          qed_start_need_check_timer(s);
368      }
369  }
370  
371  static void bdrv_qed_drain_begin(BlockDriverState *bs)
372  {
373      BDRVQEDState *s = bs->opaque;
374  
375      /* Fire the timer immediately in order to start doing I/O as soon as the
376       * header is flushed.
377       */
378      if (s->need_check_timer && timer_pending(s->need_check_timer)) {
379          Coroutine *co;
380  
381          qed_cancel_need_check_timer(s);
382          co = qemu_coroutine_create(qed_need_check_timer_entry, s);
383          bdrv_inc_in_flight(bs);
384          aio_co_enter(bdrv_get_aio_context(bs), co);
385      }
386  }
387  
388  static void bdrv_qed_init_state(BlockDriverState *bs)
389  {
390      BDRVQEDState *s = bs->opaque;
391  
392      memset(s, 0, sizeof(BDRVQEDState));
393      s->bs = bs;
394      qemu_co_mutex_init(&s->table_lock);
395      qemu_co_queue_init(&s->allocating_write_reqs);
396  }
397  
398  /* Called with table_lock held.  */
399  static int coroutine_fn GRAPH_RDLOCK
400  bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
401  {
402      BDRVQEDState *s = bs->opaque;
403      QEDHeader le_header;
404      int64_t file_size;
405      int ret;
406  
407      ret = bdrv_co_pread(bs->file, 0, sizeof(le_header), &le_header, 0);
408      if (ret < 0) {
409          error_setg(errp, "Failed to read QED header");
410          return ret;
411      }
412      qed_header_le_to_cpu(&le_header, &s->header);
413  
414      if (s->header.magic != QED_MAGIC) {
415          error_setg(errp, "Image not in QED format");
416          return -EINVAL;
417      }
418      if (s->header.features & ~QED_FEATURE_MASK) {
419          /* image uses unsupported feature bits */
420          error_setg(errp, "Unsupported QED features: %" PRIx64,
421                     s->header.features & ~QED_FEATURE_MASK);
422          return -ENOTSUP;
423      }
424      if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
425          error_setg(errp, "QED cluster size is invalid");
426          return -EINVAL;
427      }
428  
429      /* Round down file size to the last cluster */
430      file_size = bdrv_co_getlength(bs->file->bs);
431      if (file_size < 0) {
432          error_setg(errp, "Failed to get file length");
433          return file_size;
434      }
435      s->file_size = qed_start_of_cluster(s, file_size);
436  
437      if (!qed_is_table_size_valid(s->header.table_size)) {
438          error_setg(errp, "QED table size is invalid");
439          return -EINVAL;
440      }
441      if (!qed_is_image_size_valid(s->header.image_size,
442                                   s->header.cluster_size,
443                                   s->header.table_size)) {
444          error_setg(errp, "QED image size is invalid");
445          return -EINVAL;
446      }
447      if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
448          error_setg(errp, "QED table offset is invalid");
449          return -EINVAL;
450      }
451  
452      s->table_nelems = (s->header.cluster_size * s->header.table_size) /
453                        sizeof(uint64_t);
454      s->l2_shift = ctz32(s->header.cluster_size);
455      s->l2_mask = s->table_nelems - 1;
456      s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
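    /*
     * Worked example with the default 64 KiB clusters and 4-cluster tables:
     * table_nelems = 32768, l2_shift = 16, l2_mask = 0x7fff and l1_shift = 31,
     * i.e. each L2 table covers 2 GiB, the L1 index of a position is
     * pos >> 31 and its L2 index is (pos >> 16) & 0x7fff.
     */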
457  
458      /* Header size calculation must not overflow uint32_t */
459      if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
460          error_setg(errp, "QED header size is too large");
461          return -EINVAL;
462      }
463  
464      if ((s->header.features & QED_F_BACKING_FILE)) {
465          g_autofree char *backing_file_str = NULL;
466  
467          if ((uint64_t)s->header.backing_filename_offset +
468              s->header.backing_filename_size >
469              s->header.cluster_size * s->header.header_size) {
470              error_setg(errp, "QED backing filename offset is invalid");
471              return -EINVAL;
472          }
473  
474          backing_file_str = g_malloc(sizeof(bs->backing_file));
475          ret = qed_read_string(bs->file, s->header.backing_filename_offset,
476                                s->header.backing_filename_size,
477                                backing_file_str, sizeof(bs->backing_file));
478          if (ret < 0) {
479              error_setg(errp, "Failed to read backing filename");
480              return ret;
481          }
482  
483          if (!g_str_equal(backing_file_str, bs->backing_file)) {
484              pstrcpy(bs->backing_file, sizeof(bs->backing_file),
485                      backing_file_str);
486              pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
487                      backing_file_str);
488          }
489  
490          if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
491              pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
492          }
493      }
494  
495      /* Reset unknown autoclear feature bits.  This is a backwards
496       * compatibility mechanism that allows images to be opened by older
497       * programs, which "knock out" unknown feature bits.  When an image is
498       * opened by a newer program again it can detect that the autoclear
499       * feature is no longer valid.
500       */
501      if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
502          !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
503          s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
504  
505          ret = qed_write_header_sync(s);
506          if (ret) {
507              error_setg(errp, "Failed to update header");
508              return ret;
509          }
510  
511          /* From here on only known autoclear feature bits are valid */
512          bdrv_co_flush(bs->file->bs);
513      }
514  
515      s->l1_table = qed_alloc_table(s);
516      qed_init_l2_cache(&s->l2_cache);
517  
518      ret = qed_read_l1_table_sync(s);
519      if (ret) {
520          error_setg(errp, "Failed to read L1 table");
521          goto out;
522      }
523  
524      /* If image was not closed cleanly, check consistency */
525      if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
526          /* Read-only images cannot be fixed.  There is no risk of corruption
527           * since write operations are not possible.  Therefore, allow
528           * potentially inconsistent images to be opened read-only.  This can
529           * aid data recovery from an otherwise inconsistent image.
530           */
531          if (!bdrv_is_read_only(bs->file->bs) &&
532              !(flags & BDRV_O_INACTIVE)) {
533              BdrvCheckResult result = {0};
534  
535              ret = qed_check(s, &result, true);
536              if (ret) {
537                  error_setg(errp, "Image corrupted");
538                  goto out;
539              }
540          }
541      }
542  
543      bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
544  
545  out:
546      if (ret) {
547          qed_free_l2_cache(&s->l2_cache);
548          qemu_vfree(s->l1_table);
549      }
550      return ret;
551  }
552  
553  typedef struct QEDOpenCo {
554      BlockDriverState *bs;
555      QDict *options;
556      int flags;
557      Error **errp;
558      int ret;
559  } QEDOpenCo;
560  
561  static void coroutine_fn bdrv_qed_open_entry(void *opaque)
562  {
563      QEDOpenCo *qoc = opaque;
564      BDRVQEDState *s = qoc->bs->opaque;
565  
566      GRAPH_RDLOCK_GUARD();
567  
568      qemu_co_mutex_lock(&s->table_lock);
569      qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
570      qemu_co_mutex_unlock(&s->table_lock);
571  }
572  
573  static int coroutine_mixed_fn bdrv_qed_open(BlockDriverState *bs, QDict *options,
574                                              int flags, Error **errp)
575  {
576      QEDOpenCo qoc = {
577          .bs = bs,
578          .options = options,
579          .flags = flags,
580          .errp = errp,
581          .ret = -EINPROGRESS
582      };
583      int ret;
584  
585      ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
586      if (ret < 0) {
587          return ret;
588      }
589  
590      bdrv_qed_init_state(bs);
591      assert(!qemu_in_coroutine());
592      assert(qemu_get_current_aio_context() == qemu_get_aio_context());
593      qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
594      BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
595  
596      return qoc.ret;
597  }
598  
599  static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
600  {
601      BDRVQEDState *s = bs->opaque;
602  
603      bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
604      bs->bl.max_pwrite_zeroes = QEMU_ALIGN_DOWN(INT_MAX, s->header.cluster_size);
605  }
606  
607  /* We have nothing to do for QED reopen; the stubs just return
608   * success */
609  static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
610                                     BlockReopenQueue *queue, Error **errp)
611  {
612      return 0;
613  }
614  
615  static void GRAPH_RDLOCK bdrv_qed_do_close(BlockDriverState *bs)
616  {
617      BDRVQEDState *s = bs->opaque;
618  
619      bdrv_qed_detach_aio_context(bs);
620  
621      /* Ensure writes reach stable storage */
622      bdrv_flush(bs->file->bs);
623  
624      /* Clean shutdown, no check required on next open */
625      if (s->header.features & QED_F_NEED_CHECK) {
626          s->header.features &= ~QED_F_NEED_CHECK;
627          qed_write_header_sync(s);
628      }
629  
630      qed_free_l2_cache(&s->l2_cache);
631      qemu_vfree(s->l1_table);
632  }
633  
634  static void GRAPH_UNLOCKED bdrv_qed_close(BlockDriverState *bs)
635  {
636      GLOBAL_STATE_CODE();
637      GRAPH_RDLOCK_GUARD_MAINLOOP();
638  
639      bdrv_qed_do_close(bs);
640  }
641  
642  static int coroutine_fn GRAPH_UNLOCKED
643  bdrv_qed_co_create(BlockdevCreateOptions *opts, Error **errp)
644  {
645      BlockdevCreateOptionsQed *qed_opts;
646      BlockBackend *blk = NULL;
647      BlockDriverState *bs = NULL;
648  
649      QEDHeader header;
650      QEDHeader le_header;
651      uint8_t *l1_table = NULL;
652      size_t l1_size;
653      int ret = 0;
654  
655      assert(opts->driver == BLOCKDEV_DRIVER_QED);
656      qed_opts = &opts->u.qed;
657  
658      /* Validate options and set default values */
659      if (!qed_opts->has_cluster_size) {
660          qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
661      }
662      if (!qed_opts->has_table_size) {
663          qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
664      }
665  
666      if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
667          error_setg(errp, "QED cluster size must be within range [%u, %u] "
668                           "and power of 2",
669                     QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
670          return -EINVAL;
671      }
672      if (!qed_is_table_size_valid(qed_opts->table_size)) {
673          error_setg(errp, "QED table size must be within range [%u, %u] "
674                           "and power of 2",
675                     QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
676          return -EINVAL;
677      }
678      if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
679                                   qed_opts->table_size))
680      {
681          error_setg(errp, "QED image size must be a non-zero multiple of "
682                           "cluster size and less than %" PRIu64 " bytes",
683                     qed_max_image_size(qed_opts->cluster_size,
684                                        qed_opts->table_size));
685          return -EINVAL;
686      }
687  
688      /* Create BlockBackend to write to the image */
689      bs = bdrv_co_open_blockdev_ref(qed_opts->file, errp);
690      if (bs == NULL) {
691          return -EIO;
692      }
693  
694      blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
695                               errp);
696      if (!blk) {
697          ret = -EPERM;
698          goto out;
699      }
700      blk_set_allow_write_beyond_eof(blk, true);
701  
702      /* Prepare image format */
703      header = (QEDHeader) {
704          .magic = QED_MAGIC,
705          .cluster_size = qed_opts->cluster_size,
706          .table_size = qed_opts->table_size,
707          .header_size = 1,
708          .features = 0,
709          .compat_features = 0,
710          .l1_table_offset = qed_opts->cluster_size,
711          .image_size = qed_opts->size,
712      };
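    /*
     * Resulting layout of a freshly created image: the header occupies the
     * first cluster (header_size = 1), the optional backing filename string
     * is written immediately after the header struct within that cluster,
     * and the zeroed L1 table starts at the second cluster
     * (l1_table_offset == cluster_size).
     */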
713  
714      l1_size = header.cluster_size * header.table_size;
715  
716      /*
717       * The QED format associates file length with allocation status,
718       * so a new file (which is empty) must have a length of 0.
719       */
720      ret = blk_co_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp);
721      if (ret < 0) {
722          goto out;
723      }
724  
725      if (qed_opts->backing_file) {
726          header.features |= QED_F_BACKING_FILE;
727          header.backing_filename_offset = sizeof(le_header);
728          header.backing_filename_size = strlen(qed_opts->backing_file);
729  
730          if (qed_opts->has_backing_fmt) {
731              const char *backing_fmt = BlockdevDriver_str(qed_opts->backing_fmt);
732              if (qed_fmt_is_raw(backing_fmt)) {
733                  header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
734              }
735          }
736      }
737  
738      qed_header_cpu_to_le(&header, &le_header);
739      ret = blk_co_pwrite(blk, 0, sizeof(le_header), &le_header, 0);
740      if (ret < 0) {
741          goto out;
742      }
743      ret = blk_co_pwrite(blk, sizeof(le_header), header.backing_filename_size,
744                       qed_opts->backing_file, 0);
745      if (ret < 0) {
746          goto out;
747      }
748  
749      l1_table = g_malloc0(l1_size);
750      ret = blk_co_pwrite(blk, header.l1_table_offset, l1_size, l1_table, 0);
751      if (ret < 0) {
752          goto out;
753      }
754  
755      ret = 0; /* success */
756  out:
757      g_free(l1_table);
758      blk_co_unref(blk);
759      bdrv_co_unref(bs);
760      return ret;
761  }
762  
763  static int coroutine_fn GRAPH_UNLOCKED
764  bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,
765                          QemuOpts *opts, Error **errp)
766  {
767      BlockdevCreateOptions *create_options = NULL;
768      QDict *qdict;
769      Visitor *v;
770      BlockDriverState *bs = NULL;
771      int ret;
772  
773      static const QDictRenames opt_renames[] = {
774          { BLOCK_OPT_BACKING_FILE,       "backing-file" },
775          { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
776          { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
777          { BLOCK_OPT_TABLE_SIZE,         "table-size" },
778          { NULL, NULL },
779      };
780  
781      /* Parse options and convert legacy syntax */
782      qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);
783  
784      if (!qdict_rename_keys(qdict, opt_renames, errp)) {
785          ret = -EINVAL;
786          goto fail;
787      }
788  
789      /* Create and open the file (protocol layer) */
790      ret = bdrv_co_create_file(filename, opts, errp);
791      if (ret < 0) {
792          goto fail;
793      }
794  
795      bs = bdrv_co_open(filename, NULL, NULL,
796                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
797      if (bs == NULL) {
798          ret = -EIO;
799          goto fail;
800      }
801  
802      /* Now get the QAPI type BlockdevCreateOptions */
803      qdict_put_str(qdict, "driver", "qed");
804      qdict_put_str(qdict, "file", bs->node_name);
805  
806      v = qobject_input_visitor_new_flat_confused(qdict, errp);
807      if (!v) {
808          ret = -EINVAL;
809          goto fail;
810      }
811  
812      visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
813      visit_free(v);
814      if (!create_options) {
815          ret = -EINVAL;
816          goto fail;
817      }
818  
819      /* Silently round up size */
820      assert(create_options->driver == BLOCKDEV_DRIVER_QED);
821      create_options->u.qed.size =
822          ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);
823  
824      /* Create the qed image (format layer) */
825      ret = bdrv_qed_co_create(create_options, errp);
826  
827  fail:
828      qobject_unref(qdict);
829      bdrv_co_unref(bs);
830      qapi_free_BlockdevCreateOptions(create_options);
831      return ret;
832  }
833  
834  static int coroutine_fn GRAPH_RDLOCK
835  bdrv_qed_co_block_status(BlockDriverState *bs, bool want_zero, int64_t pos,
836                           int64_t bytes, int64_t *pnum, int64_t *map,
837                           BlockDriverState **file)
838  {
839      BDRVQEDState *s = bs->opaque;
840      size_t len = MIN(bytes, SIZE_MAX);
841      int status;
842      QEDRequest request = { .l2_table = NULL };
843      uint64_t offset;
844      int ret;
845  
846      qemu_co_mutex_lock(&s->table_lock);
847      ret = qed_find_cluster(s, &request, pos, &len, &offset);
848  
849      *pnum = len;
850      switch (ret) {
851      case QED_CLUSTER_FOUND:
852          *map = offset | qed_offset_into_cluster(s, pos);
853          status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
854          *file = bs->file->bs;
855          break;
856      case QED_CLUSTER_ZERO:
857          status = BDRV_BLOCK_ZERO;
858          break;
859      case QED_CLUSTER_L2:
860      case QED_CLUSTER_L1:
861          status = 0;
862          break;
863      default:
864          assert(ret < 0);
865          status = ret;
866          break;
867      }
868  
869      qed_unref_l2_cache_entry(request.l2_table);
870      qemu_co_mutex_unlock(&s->table_lock);
871  
872      return status;
873  }
874  
875  static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
876  {
877      return acb->bs->opaque;
878  }
879  
880  /**
881   * Read from the backing file or zero-fill if no backing file
882   *
883   * @s:              QED state
884   * @pos:            Byte position in device
885   * @qiov:           Destination I/O vector
886   *
887   * This function reads qiov->size bytes starting at pos from the backing file.
888   * If there is no backing file then zeroes are read.
889   */
890  static int coroutine_fn GRAPH_RDLOCK
891  qed_read_backing_file(BDRVQEDState *s, uint64_t pos, QEMUIOVector *qiov)
892  {
893      if (s->bs->backing) {
894          BLKDBG_CO_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
895          return bdrv_co_preadv(s->bs->backing, pos, qiov->size, qiov, 0);
896      }
897      qemu_iovec_memset(qiov, 0, 0, qiov->size);
898      return 0;
899  }
900  
901  /**
902   * Copy data from backing file into the image
903   *
904   * @s:          QED state
905   * @pos:        Byte position in device
906   * @len:        Number of bytes
907   * @offset:     Byte offset in image file
908   */
909  static int coroutine_fn GRAPH_RDLOCK
910  qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, uint64_t len,
911                             uint64_t offset)
912  {
913      QEMUIOVector qiov;
914      int ret;
915  
916      /* Skip copy entirely if there is no work to do */
917      if (len == 0) {
918          return 0;
919      }
920  
921      qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
922  
923      ret = qed_read_backing_file(s, pos, &qiov);
924  
925      if (ret) {
926          goto out;
927      }
928  
929      BLKDBG_CO_EVENT(s->bs->file, BLKDBG_COW_WRITE);
930      ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
931      if (ret < 0) {
932          goto out;
933      }
934      ret = 0;
935  out:
936      qemu_vfree(qemu_iovec_buf(&qiov));
937      return ret;
938  }
939  
940  /**
941   * Link one or more contiguous clusters into a table
942   *
943   * @s:              QED state
944   * @table:          L2 table
945   * @index:          First cluster index
946   * @n:              Number of contiguous clusters
947   * @cluster:        First cluster offset
948   *
949   * The cluster offset may be an allocated byte offset in the image file, the
950   * zero cluster marker, or the unallocated cluster marker.
951   *
952   * Called with table_lock held.
953   */
954  static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
955                                               int index, unsigned int n,
956                                               uint64_t cluster)
957  {
958      int i;
959      for (i = index; i < index + n; i++) {
960          table->offsets[i] = cluster;
961          if (!qed_offset_is_unalloc_cluster(cluster) &&
962              !qed_offset_is_zero_cluster(cluster)) {
963              cluster += s->header.cluster_size;
964          }
965      }
966  }
967  
968  /* Called with table_lock held.  */
969  static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
970  {
971      BDRVQEDState *s = acb_to_s(acb);
972  
973      /* Free resources */
974      qemu_iovec_destroy(&acb->cur_qiov);
975      qed_unref_l2_cache_entry(acb->request.l2_table);
976  
977      /* Free the buffer we may have allocated for zero writes */
978      if (acb->flags & QED_AIOCB_ZERO) {
979          qemu_vfree(acb->qiov->iov[0].iov_base);
980          acb->qiov->iov[0].iov_base = NULL;
981      }
982  
983      /* Start next allocating write request waiting behind this one.  Note that
984       * requests enqueue themselves when they first hit an unallocated cluster
985       * but they wait until the entire request is finished before waking up the
986       * next request in the queue.  This ensures that we don't cycle through
987       * requests multiple times but rather finish one at a time completely.
988       */
989      if (acb == s->allocating_acb) {
990          s->allocating_acb = NULL;
991          if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
992              qemu_co_queue_next(&s->allocating_write_reqs);
993          } else if (s->header.features & QED_F_NEED_CHECK) {
994              qed_start_need_check_timer(s);
995          }
996      }
997  }
998  
999  /**
1000   * Update L1 table with new L2 table offset and write it out
1001   *
1002   * Called with table_lock held.
1003   */
1004  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_l1_update(QEDAIOCB *acb)
1005  {
1006      BDRVQEDState *s = acb_to_s(acb);
1007      CachedL2Table *l2_table = acb->request.l2_table;
1008      uint64_t l2_offset = l2_table->offset;
1009      int index, ret;
1010  
1011      index = qed_l1_index(s, acb->cur_pos);
1012      s->l1_table->offsets[index] = l2_table->offset;
1013  
1014      ret = qed_write_l1_table(s, index, 1);
1015  
1016      /* Commit the current L2 table to the cache */
1017      qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
1018  
1019      /* This is guaranteed to succeed because we just committed the entry to the
1020       * cache.
1021       */
1022      acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
1023      assert(acb->request.l2_table != NULL);
1024  
1025      return ret;
1026  }
1027  
1028  
1029  /**
1030   * Update L2 table with new cluster offsets and write them out
1031   *
1032   * Called with table_lock held.
1033   */
1034  static int coroutine_fn GRAPH_RDLOCK
1035  qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
1036  {
1037      BDRVQEDState *s = acb_to_s(acb);
1038      bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1039      int index, ret;
1040  
1041      if (need_alloc) {
1042          qed_unref_l2_cache_entry(acb->request.l2_table);
1043          acb->request.l2_table = qed_new_l2_table(s);
1044      }
1045  
1046      index = qed_l2_index(s, acb->cur_pos);
1047      qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1048                           offset);
1049  
1050      if (need_alloc) {
1051          /* Write out the whole new L2 table */
1052          ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
1053          if (ret) {
1054              return ret;
1055          }
1056          return qed_aio_write_l1_update(acb);
1057      } else {
1058          /* Write out only the updated part of the L2 table */
1059          ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
1060                                   false);
1061          if (ret) {
1062              return ret;
1063          }
1064      }
1065      return 0;
1066  }
1067  
1068  /**
1069   * Write data to the image file
1070   *
1071   * Called with table_lock *not* held.
1072   */
1073  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_main(QEDAIOCB *acb)
1074  {
1075      BDRVQEDState *s = acb_to_s(acb);
1076      uint64_t offset = acb->cur_cluster +
1077                        qed_offset_into_cluster(s, acb->cur_pos);
1078  
1079      trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
1080  
1081      BLKDBG_CO_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1082      return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
1083                             &acb->cur_qiov, 0);
1084  }
1085  
1086  /**
1087   * Populate untouched regions of new data cluster
1088   *
1089   * Called with table_lock held.
1090   */
1091  static int coroutine_fn GRAPH_RDLOCK qed_aio_write_cow(QEDAIOCB *acb)
1092  {
1093      BDRVQEDState *s = acb_to_s(acb);
1094      uint64_t start, len, offset;
1095      int ret;
1096  
1097      qemu_co_mutex_unlock(&s->table_lock);
1098  
1099      /* Populate front untouched region of new data cluster */
1100      start = qed_start_of_cluster(s, acb->cur_pos);
1101      len = qed_offset_into_cluster(s, acb->cur_pos);
1102  
1103      trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1104      ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
1105      if (ret < 0) {
1106          goto out;
1107      }
1108  
1109      /* Populate back untouched region of new data cluster */
1110      start = acb->cur_pos + acb->cur_qiov.size;
1111      len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
1112      offset = acb->cur_cluster +
1113               qed_offset_into_cluster(s, acb->cur_pos) +
1114               acb->cur_qiov.size;
1115  
1116      trace_qed_aio_write_postfill(s, acb, start, len, offset);
1117      ret = qed_copy_from_backing_file(s, start, len, offset);
1118      if (ret < 0) {
1119          goto out;
1120      }
1121  
1122      ret = qed_aio_write_main(acb);
1123      if (ret < 0) {
1124          goto out;
1125      }
1126  
1127      if (s->bs->backing) {
1128          /*
1129           * Flush new data clusters before updating the L2 table
1130           *
1131           * This flush is necessary when a backing file is in use.  A crash
1132           * during an allocating write could result in empty clusters in the
1133           * image.  If the write only touched a subregion of the cluster,
1134           * then backing image sectors have been lost in the untouched
1135           * region.  The solution is to flush after writing a new data
1136           * cluster and before updating the L2 table.
1137           */
1138          ret = bdrv_co_flush(s->bs->file->bs);
1139      }
1140  
1141  out:
1142      qemu_co_mutex_lock(&s->table_lock);
1143      return ret;
1144  }
1145  
1146  /**
1147   * Check if the QED_F_NEED_CHECK bit should be set during allocating write
1148   */
1149  static bool GRAPH_RDLOCK qed_should_set_need_check(BDRVQEDState *s)
1150  {
1151      /* The flush before L2 update path ensures consistency */
1152      if (s->bs->backing) {
1153          return false;
1154      }
1155  
1156      return !(s->header.features & QED_F_NEED_CHECK);
1157  }
1158  
1159  /**
1160   * Write new data cluster
1161   *
1162   * @acb:        Write request
1163   * @len:        Length in bytes
1164   *
1165   * This path is taken when writing to previously unallocated clusters.
1166   *
1167   * Called with table_lock held.
1168   */
1169  static int coroutine_fn GRAPH_RDLOCK
1170  qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1171  {
1172      BDRVQEDState *s = acb_to_s(acb);
1173      int ret;
1174  
1175      /* Cancel timer when the first allocating request comes in */
1176      if (s->allocating_acb == NULL) {
1177          qed_cancel_need_check_timer(s);
1178      }
1179  
1180      /* Freeze this request if another allocating write is in progress */
1181      if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
1182          if (s->allocating_acb != NULL) {
1183              qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
1184              assert(s->allocating_acb == NULL);
1185          }
1186          s->allocating_acb = acb;
1187          return -EAGAIN; /* start over with looking up table entries */
1188      }
1189  
1190      acb->cur_nclusters = qed_bytes_to_clusters(s,
1191              qed_offset_into_cluster(s, acb->cur_pos) + len);
1192      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1193  
1194      if (acb->flags & QED_AIOCB_ZERO) {
1195          /* Skip ahead if the clusters are already zero */
1196          if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1197              return 0;
1198          }
1199          acb->cur_cluster = 1;
1200      } else {
1201          acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
1202      }
1203  
1204      if (qed_should_set_need_check(s)) {
1205          s->header.features |= QED_F_NEED_CHECK;
1206          ret = qed_write_header(s);
1207          if (ret < 0) {
1208              return ret;
1209          }
1210      }
1211  
1212      if (!(acb->flags & QED_AIOCB_ZERO)) {
1213          ret = qed_aio_write_cow(acb);
1214          if (ret < 0) {
1215              return ret;
1216          }
1217      }
1218  
1219      return qed_aio_write_l2_update(acb, acb->cur_cluster);
1220  }
1221  
1222  /**
1223   * Write data cluster in place
1224   *
1225   * @acb:        Write request
1226   * @offset:     Cluster offset in bytes
1227   * @len:        Length in bytes
1228   *
1229   * This path is taken when writing to already allocated clusters.
1230   *
1231   * Called with table_lock held.
1232   */
1233  static int coroutine_fn GRAPH_RDLOCK
1234  qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
1235  {
1236      BDRVQEDState *s = acb_to_s(acb);
1237      int r;
1238  
1239      qemu_co_mutex_unlock(&s->table_lock);
1240  
1241      /* Allocate buffer for zero writes */
1242      if (acb->flags & QED_AIOCB_ZERO) {
1243          struct iovec *iov = acb->qiov->iov;
1244  
1245          if (!iov->iov_base) {
1246              iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
1247              if (iov->iov_base == NULL) {
1248                  r = -ENOMEM;
1249                  goto out;
1250              }
1251              memset(iov->iov_base, 0, iov->iov_len);
1252          }
1253      }
1254  
1255      /* Calculate the I/O vector */
1256      acb->cur_cluster = offset;
1257      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1258  
1259      /* Do the actual write.  */
1260      r = qed_aio_write_main(acb);
1261  out:
1262      qemu_co_mutex_lock(&s->table_lock);
1263      return r;
1264  }
1265  
1266  /**
1267   * Write data cluster
1268   *
1269   * @opaque:     Write request
1270   * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1271   * @offset:     Cluster offset in bytes
1272   * @len:        Length in bytes
1273   *
1274   * Called with table_lock held.
1275   */
1276  static int coroutine_fn GRAPH_RDLOCK
1277  qed_aio_write_data(void *opaque, int ret, uint64_t offset, size_t len)
1278  {
1279      QEDAIOCB *acb = opaque;
1280  
1281      trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1282  
1283      acb->find_cluster_ret = ret;
1284  
1285      switch (ret) {
1286      case QED_CLUSTER_FOUND:
1287          return qed_aio_write_inplace(acb, offset, len);
1288  
1289      case QED_CLUSTER_L2:
1290      case QED_CLUSTER_L1:
1291      case QED_CLUSTER_ZERO:
1292          return qed_aio_write_alloc(acb, len);
1293  
1294      default:
1295          g_assert_not_reached();
1296      }
1297  }
1298  
1299  /**
1300   * Read data cluster
1301   *
1302   * @opaque:     Read request
1303   * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1304   * @offset:     Cluster offset in bytes
1305   * @len:        Length in bytes
1306   *
1307   * Called with table_lock held.
1308   */
1309  static int coroutine_fn GRAPH_RDLOCK
1310  qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
1311  {
1312      QEDAIOCB *acb = opaque;
1313      BDRVQEDState *s = acb_to_s(acb);
1314      BlockDriverState *bs = acb->bs;
1315      int r;
1316  
1317      qemu_co_mutex_unlock(&s->table_lock);
1318  
1319      /* Adjust offset into cluster */
1320      offset += qed_offset_into_cluster(s, acb->cur_pos);
1321  
1322      trace_qed_aio_read_data(s, acb, ret, offset, len);
1323  
1324      qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1325  
1326      /* Handle zero cluster and backing file reads, otherwise read
1327       * data cluster directly.
1328       */
1329      if (ret == QED_CLUSTER_ZERO) {
1330          qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
1331          r = 0;
1332      } else if (ret != QED_CLUSTER_FOUND) {
1333          r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov);
1334      } else {
1335          BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
1336          r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
1337                             &acb->cur_qiov, 0);
1338      }
1339  
1340      qemu_co_mutex_lock(&s->table_lock);
1341      return r;
1342  }
1343  
1344  /**
1345   * Begin next I/O or complete the request
1346   */
1347  static int coroutine_fn GRAPH_RDLOCK qed_aio_next_io(QEDAIOCB *acb)
1348  {
1349      BDRVQEDState *s = acb_to_s(acb);
1350      uint64_t offset;
1351      size_t len;
1352      int ret;
1353  
1354      qemu_co_mutex_lock(&s->table_lock);
1355      while (1) {
1356          trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
1357  
1358          acb->qiov_offset += acb->cur_qiov.size;
1359          acb->cur_pos += acb->cur_qiov.size;
1360          qemu_iovec_reset(&acb->cur_qiov);
1361  
1362          /* Complete request */
1363          if (acb->cur_pos >= acb->end_pos) {
1364              ret = 0;
1365              break;
1366          }
1367  
1368          /* Find next cluster and start I/O */
1369          len = acb->end_pos - acb->cur_pos;
1370          ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
1371          if (ret < 0) {
1372              break;
1373          }
1374  
1375          if (acb->flags & QED_AIOCB_WRITE) {
1376              ret = qed_aio_write_data(acb, ret, offset, len);
1377          } else {
1378              ret = qed_aio_read_data(acb, ret, offset, len);
1379          }
1380  
1381          if (ret < 0 && ret != -EAGAIN) {
1382              break;
1383          }
1384      }
1385  
1386      trace_qed_aio_complete(s, acb, ret);
1387      qed_aio_complete(acb);
1388      qemu_co_mutex_unlock(&s->table_lock);
1389      return ret;
1390  }
1391  
1392  static int coroutine_fn GRAPH_RDLOCK
1393  qed_co_request(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov,
1394                 int nb_sectors, int flags)
1395  {
1396      QEDAIOCB acb = {
1397          .bs         = bs,
1398          .cur_pos    = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
1399          .end_pos    = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
1400          .qiov       = qiov,
1401          .flags      = flags,
1402      };
1403      qemu_iovec_init(&acb.cur_qiov, qiov->niov);
1404  
1405      trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
1406  
1407      /* Start request */
1408      return qed_aio_next_io(&acb);
1409  }
1410  
1411  static int coroutine_fn GRAPH_RDLOCK
1412  bdrv_qed_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1413                    QEMUIOVector *qiov)
1414  {
1415      return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
1416  }
1417  
1418  static int coroutine_fn GRAPH_RDLOCK
1419  bdrv_qed_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1420                     QEMUIOVector *qiov, int flags)
1421  {
1422      return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
1423  }
1424  
1425  static int coroutine_fn GRAPH_RDLOCK
1426  bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
1427                            BdrvRequestFlags flags)
1428  {
1429      BDRVQEDState *s = bs->opaque;
1430  
1431      /*
1432       * Zero writes start without an I/O buffer.  If a buffer becomes necessary
1433       * then it will be allocated during request processing.
1434       */
1435      QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
1436  
1437      /*
1438       * QED is not prepared for 63-bit write-zero requests, so rely on
1439       * max_pwrite_zeroes.
1440       */
1441      assert(bytes <= INT_MAX);
1442  
1443      /* Fall back if the request is not aligned */
1444      if (qed_offset_into_cluster(s, offset) ||
1445          qed_offset_into_cluster(s, bytes)) {
1446          return -ENOTSUP;
1447      }
1448  
1449      return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
1450                            bytes >> BDRV_SECTOR_BITS,
1451                            QED_AIOCB_WRITE | QED_AIOCB_ZERO);
1452  }
1453  
1454  static int coroutine_fn GRAPH_RDLOCK
1455  bdrv_qed_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
1456                       PreallocMode prealloc, BdrvRequestFlags flags,
1457                       Error **errp)
1458  {
1459      BDRVQEDState *s = bs->opaque;
1460      uint64_t old_image_size;
1461      int ret;
1462  
1463      if (prealloc != PREALLOC_MODE_OFF) {
1464          error_setg(errp, "Unsupported preallocation mode '%s'",
1465                     PreallocMode_str(prealloc));
1466          return -ENOTSUP;
1467      }
1468  
1469      if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1470                                   s->header.table_size)) {
1471          error_setg(errp, "Invalid image size specified");
1472          return -EINVAL;
1473      }
1474  
1475      if ((uint64_t)offset < s->header.image_size) {
1476          error_setg(errp, "Shrinking images is currently not supported");
1477          return -ENOTSUP;
1478      }
1479  
1480      old_image_size = s->header.image_size;
1481      s->header.image_size = offset;
1482      ret = qed_write_header_sync(s);
1483      if (ret < 0) {
1484          s->header.image_size = old_image_size;
1485          error_setg_errno(errp, -ret, "Failed to update the image size");
1486      }
1487      return ret;
1488  }
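/*
 * Growing the image only rewrites the image_size field in the header; no new
 * data clusters are allocated here.  Until it is written, the added tail
 * reads back as zeroes (or as backing file data, if a backing file covers
 * that range), because unallocated clusters are resolved at read time.
 */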
1489  
1490  static int64_t coroutine_fn bdrv_qed_co_getlength(BlockDriverState *bs)
1491  {
1492      BDRVQEDState *s = bs->opaque;
1493      return s->header.image_size;
1494  }
1495  
1496  static int coroutine_fn
1497  bdrv_qed_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1498  {
1499      BDRVQEDState *s = bs->opaque;
1500  
1501      memset(bdi, 0, sizeof(*bdi));
1502      bdi->cluster_size = s->header.cluster_size;
1503      bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1504      return 0;
1505  }
1506  
1507  static int coroutine_fn GRAPH_RDLOCK
1508  bdrv_qed_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
1509                                  const char *backing_fmt)
1510  {
1511      BDRVQEDState *s = bs->opaque;
1512      QEDHeader new_header, le_header;
1513      void *buffer;
1514      size_t buffer_len, backing_file_len;
1515      int ret;
1516  
1517      /* Refuse to set backing filename if unknown compat feature bits are
1518       * active.  If the image uses an unknown compat feature then we may not
1519       * know the layout of data following the header structure and cannot safely
1520       * add a new string.
1521       */
1522      if (backing_file && (s->header.compat_features &
1523                           ~QED_COMPAT_FEATURE_MASK)) {
1524          return -ENOTSUP;
1525      }
1526  
1527      memcpy(&new_header, &s->header, sizeof(new_header));
1528  
1529      new_header.features &= ~(QED_F_BACKING_FILE |
1530                               QED_F_BACKING_FORMAT_NO_PROBE);
1531  
1532      /* Adjust feature flags */
1533      if (backing_file) {
1534          new_header.features |= QED_F_BACKING_FILE;
1535  
1536          if (qed_fmt_is_raw(backing_fmt)) {
1537              new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
1538          }
1539      }
1540  
1541      /* Calculate new header size */
1542      backing_file_len = 0;
1543  
1544      if (backing_file) {
1545          backing_file_len = strlen(backing_file);
1546      }
1547  
1548      buffer_len = sizeof(new_header);
1549      new_header.backing_filename_offset = buffer_len;
1550      new_header.backing_filename_size = backing_file_len;
1551      buffer_len += backing_file_len;
1552  
1553      /* Make sure we can rewrite header without failing */
1554      if (buffer_len > new_header.header_size * new_header.cluster_size) {
1555          return -ENOSPC;
1556      }
1557  
1558      /* Prepare new header */
1559      buffer = g_malloc(buffer_len);
1560  
1561      qed_header_cpu_to_le(&new_header, &le_header);
1562      memcpy(buffer, &le_header, sizeof(le_header));
1563      buffer_len = sizeof(le_header);
1564  
1565      if (backing_file) {
1566          memcpy(buffer + buffer_len, backing_file, backing_file_len);
1567          buffer_len += backing_file_len;
1568      }
1569  
1570      /* Write new header */
1571      ret = bdrv_co_pwrite_sync(bs->file, 0, buffer_len, buffer, 0);
1572      g_free(buffer);
1573      if (ret == 0) {
1574          memcpy(&s->header, &new_header, sizeof(new_header));
1575      }
1576      return ret;
1577  }
1578  
1579  static void coroutine_fn GRAPH_RDLOCK
1580  bdrv_qed_co_invalidate_cache(BlockDriverState *bs, Error **errp)
1581  {
1582      ERRP_GUARD();
1583      BDRVQEDState *s = bs->opaque;
1584      int ret;
1585  
1586      bdrv_qed_do_close(bs);
1587  
1588      bdrv_qed_init_state(bs);
1589      qemu_co_mutex_lock(&s->table_lock);
1590      ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, errp);
1591      qemu_co_mutex_unlock(&s->table_lock);
1592      if (ret < 0) {
1593          error_prepend(errp, "Could not reopen qed layer: ");
1594      }
1595  }
1596  
1597  static int coroutine_fn GRAPH_RDLOCK
1598  bdrv_qed_co_check(BlockDriverState *bs, BdrvCheckResult *result,
1599                    BdrvCheckMode fix)
1600  {
1601      BDRVQEDState *s = bs->opaque;
1602      int ret;
1603  
1604      qemu_co_mutex_lock(&s->table_lock);
1605      ret = qed_check(s, result, !!fix);
1606      qemu_co_mutex_unlock(&s->table_lock);
1607  
1608      return ret;
1609  }
1610  
1611  static QemuOptsList qed_create_opts = {
1612      .name = "qed-create-opts",
1613      .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
1614      .desc = {
1615          {
1616              .name = BLOCK_OPT_SIZE,
1617              .type = QEMU_OPT_SIZE,
1618              .help = "Virtual disk size"
1619          },
1620          {
1621              .name = BLOCK_OPT_BACKING_FILE,
1622              .type = QEMU_OPT_STRING,
1623              .help = "File name of a base image"
1624          },
1625          {
1626              .name = BLOCK_OPT_BACKING_FMT,
1627              .type = QEMU_OPT_STRING,
1628              .help = "Image format of the base image"
1629          },
1630          {
1631              .name = BLOCK_OPT_CLUSTER_SIZE,
1632              .type = QEMU_OPT_SIZE,
1633              .help = "Cluster size (in bytes)",
1634              .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
1635          },
1636          {
1637              .name = BLOCK_OPT_TABLE_SIZE,
1638              .type = QEMU_OPT_SIZE,
1639              .help = "L1/L2 table size (in clusters)"
1640          },
1641          { /* end of list */ }
1642      }
1643  };
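/*
 * These options map onto qemu-img's -o syntax; for illustration, an image
 * with the default geometry spelled out explicitly could be created with:
 *
 *   qemu-img create -f qed -o cluster_size=65536,table_size=4 test.qed 16G
 */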
1644  
1645  static BlockDriver bdrv_qed = {
1646      .format_name                    = "qed",
1647      .instance_size                  = sizeof(BDRVQEDState),
1648      .create_opts                    = &qed_create_opts,
1649      .is_format                      = true,
1650      .supports_backing               = true,
1651  
1652      .bdrv_probe                     = bdrv_qed_probe,
1653      .bdrv_open                      = bdrv_qed_open,
1654      .bdrv_close                     = bdrv_qed_close,
1655      .bdrv_reopen_prepare            = bdrv_qed_reopen_prepare,
1656      .bdrv_child_perm                = bdrv_default_perms,
1657      .bdrv_co_create                 = bdrv_qed_co_create,
1658      .bdrv_co_create_opts            = bdrv_qed_co_create_opts,
1659      .bdrv_has_zero_init             = bdrv_has_zero_init_1,
1660      .bdrv_co_block_status           = bdrv_qed_co_block_status,
1661      .bdrv_co_readv                  = bdrv_qed_co_readv,
1662      .bdrv_co_writev                 = bdrv_qed_co_writev,
1663      .bdrv_co_pwrite_zeroes          = bdrv_qed_co_pwrite_zeroes,
1664      .bdrv_co_truncate               = bdrv_qed_co_truncate,
1665      .bdrv_co_getlength              = bdrv_qed_co_getlength,
1666      .bdrv_co_get_info               = bdrv_qed_co_get_info,
1667      .bdrv_refresh_limits            = bdrv_qed_refresh_limits,
1668      .bdrv_co_change_backing_file    = bdrv_qed_co_change_backing_file,
1669      .bdrv_co_invalidate_cache       = bdrv_qed_co_invalidate_cache,
1670      .bdrv_co_check                  = bdrv_qed_co_check,
1671      .bdrv_detach_aio_context        = bdrv_qed_detach_aio_context,
1672      .bdrv_attach_aio_context        = bdrv_qed_attach_aio_context,
1673      .bdrv_drain_begin               = bdrv_qed_drain_begin,
1674  };
1675  
1676  static void bdrv_qed_init(void)
1677  {
1678      bdrv_register(&bdrv_qed);
1679  }
1680  
1681  block_init(bdrv_qed_init);
1682