xref: /openbmc/qemu/block/qcow.c (revision 9971cbac2f30a89ddb094dc9627d2d16dc6e5875)
1  /*
2   * Block driver for the QCOW format
3   *
4   * Copyright (c) 2004-2006 Fabrice Bellard
5   *
6   * Permission is hereby granted, free of charge, to any person obtaining a copy
7   * of this software and associated documentation files (the "Software"), to deal
8   * in the Software without restriction, including without limitation the rights
9   * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10   * copies of the Software, and to permit persons to whom the Software is
11   * furnished to do so, subject to the following conditions:
12   *
13   * The above copyright notice and this permission notice shall be included in
14   * all copies or substantial portions of the Software.
15   *
16   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19   * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22   * THE SOFTWARE.
23   */
24  
25  #include "qemu/osdep.h"
26  #include "qapi/error.h"
27  #include "qemu/error-report.h"
28  #include "block/block_int.h"
29  #include "block/qdict.h"
30  #include "sysemu/block-backend.h"
31  #include "qemu/module.h"
32  #include "qemu/option.h"
33  #include "qemu/bswap.h"
34  #include "qemu/cutils.h"
35  #include "qemu/memalign.h"
36  #include <zlib.h>
37  #include "qapi/qmp/qdict.h"
38  #include "qapi/qmp/qstring.h"
39  #include "qapi/qobject-input-visitor.h"
40  #include "qapi/qapi-visit-block-core.h"
41  #include "crypto/block.h"
42  #include "migration/blocker.h"
43  #include "crypto.h"
44  
45  /**************************************************************/
46  /* QEMU COW block driver with compression and encryption support */
47  
/* Header magic: the bytes 'Q', 'F', 'I', 0xfb packed into one 32-bit word. */
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
/* The only header version qcow_probe() and qcow_open() accept. */
#define QCOW_VERSION 1

/* Values of the header's crypt_method field. */
#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

/*
 * Bit 63 of an L2 entry: the cluster is stored compressed.
 *
 * Built from an unsigned constant: shifting a signed 1 into the sign bit is
 * undefined behaviour in C and would make the macro a negative value.  The
 * users in this file combine it with uint64_t entries, so the unsigned type
 * does not change any of those expressions.
 */
#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
55  
/*
 * Image header as read from offset 0 of the image file.  The struct is
 * packed so it can be transferred as raw bytes; qcow_open() converts the
 * multi-byte fields from big-endian after reading them.
 */
typedef struct QCowHeader {
    uint32_t magic;               /* QCOW_MAGIC */
    uint32_t version;             /* QCOW_VERSION */
    uint64_t backing_file_offset; /* file offset of the backing file name,
                                     0 if there is none */
    uint32_t backing_file_size;   /* length of that name in bytes */
    uint32_t mtime;
    uint64_t size; /* in bytes */
    uint8_t cluster_bits;         /* log2 of the cluster size in bytes */
    uint8_t l2_bits;              /* log2 of the entry count of an L2 table */
    uint16_t padding;
    uint32_t crypt_method;        /* QCOW_CRYPT_* */
    uint64_t l1_table_offset;     /* file offset of the L1 table */
} QEMU_PACKED QCowHeader;
69  
/* Number of L2 tables kept in the in-memory L2 cache. */
#define L2_CACHE_SIZE 16

/* Per-image driver state, reached through bs->opaque. */
typedef struct BDRVQcowState {
    int cluster_bits;             /* copied from the header */
    int cluster_size;             /* 1 << cluster_bits, in bytes */
    int l2_bits;                  /* copied from the header */
    int l2_size;                  /* 1 << l2_bits, entries per L2 table */
    unsigned int l1_size;         /* number of entries in l1_table */
    uint64_t cluster_offset_mask; /* low (63 - cluster_bits) bits: the file
                                     offset part of a compressed L2 entry */
    uint64_t l1_table_offset;     /* file offset of the L1 table */
    uint64_t *l1_table;           /* L1 table, converted to host byte order */
    uint64_t *l2_cache;           /* L2_CACHE_SIZE tables of l2_size entries,
                                     kept big-endian as stored in the file */
    uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /* file offset of the table
                                                 held in each cache slot */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];  /* per-slot hit counters used
                                                 to choose the slot to evict */
    uint8_t *cluster_cache;       /* one decompressed cluster */
    uint8_t *cluster_data;        /* one-cluster scratch buffer (compressed
                                     input, encrypted zero sectors) */
    uint64_t cluster_cache_offset; /* file offset of the compressed cluster
                                      held in cluster_cache, or -1 for none */
    QCryptoBlock *crypto; /* Disk encryption format driver */
    uint32_t crypt_method_header; /* crypt_method as found in the header */
    CoMutex lock;                 /* taken by the I/O paths around
                                     get_cluster_offset() and cache use */
    Error *migration_blocker;     /* registered in qcow_open(), removed in
                                     qcow_close() */
} BDRVQcowState;
92  
/* Declared here; presumably given its option list further down the file. */
static QemuOptsList qcow_create_opts;

/* Forward declaration: get_cluster_offset() calls this before its definition. */
static int coroutine_fn GRAPH_RDLOCK
decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
97  
98  static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
99  {
100      const QCowHeader *cow_header = (const void *)buf;
101  
102      if (buf_size >= sizeof(QCowHeader) &&
103          be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
104          be32_to_cpu(cow_header->version) == QCOW_VERSION)
105          return 100;
106      else
107          return 0;
108  }
109  
/*
 * Open a qcow image on top of the "file" child of 'bs'.
 *
 * Reads and validates the header, sets up decryption for AES images,
 * loads the L1 table, allocates the L2 cache and the two cluster-sized
 * buffers, records the backing file name and registers a migration blocker.
 *
 * Options under the "encrypt." prefix are split off from 'options' and
 * consumed here.
 *
 * Returns 0 on success.  On failure returns a negative errno after freeing
 * everything allocated so far; 'errp' is set on the paths that call
 * error_setg(), while plain read failures only return the error code.
 */
static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int len, i, shift;
    int ret;
    QCowHeader header;
    QCryptoBlockOpenOptions *crypto_opts = NULL;
    unsigned int cflags = 0;
    QDict *encryptopts = NULL;
    const char *encryptfmt;

    /* Move the "encrypt.*" options into their own dictionary. */
    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
    encryptfmt = qdict_get_try_str(encryptopts, "format");

    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
    if (ret < 0) {
        /* The graph read lock has not been taken yet. */
        goto fail_unlocked;
    }

    bdrv_graph_rdlock_main_loop();

    ret = bdrv_pread(bs->file, 0, sizeof(header), &header, 0);
    if (ret < 0) {
        goto fail;
    }
    /* Header fields are stored big-endian. */
    header.magic = be32_to_cpu(header.magic);
    header.version = be32_to_cpu(header.version);
    header.backing_file_offset = be64_to_cpu(header.backing_file_offset);
    header.backing_file_size = be32_to_cpu(header.backing_file_size);
    header.mtime = be32_to_cpu(header.mtime);
    header.size = be64_to_cpu(header.size);
    header.crypt_method = be32_to_cpu(header.crypt_method);
    header.l1_table_offset = be64_to_cpu(header.l1_table_offset);

    if (header.magic != QCOW_MAGIC) {
        error_setg(errp, "Image not in qcow format");
        ret = -EINVAL;
        goto fail;
    }
    if (header.version != QCOW_VERSION) {
        error_setg(errp, "qcow (v%d) does not support qcow version %" PRIu32,
                   QCOW_VERSION, header.version);
        if (header.version == 2 || header.version == 3) {
            error_append_hint(errp, "Try the 'qcow2' driver instead.\n");
        }

        ret = -ENOTSUP;
        goto fail;
    }

    if (header.size <= 1) {
        error_setg(errp, "Image size is too small (must be at least 2 bytes)");
        ret = -EINVAL;
        goto fail;
    }
    if (header.cluster_bits < 9 || header.cluster_bits > 16) {
        error_setg(errp, "Cluster size must be between 512 and 64k");
        ret = -EINVAL;
        goto fail;
    }

    /* l2_bits specifies number of entries; storing a uint64_t in each entry,
     * so bytes = num_entries << 3. */
    if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) {
        error_setg(errp, "L2 table size must be between 512 and 64k");
        ret = -EINVAL;
        goto fail;
    }

    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header) {
        /* AES images are refused whenever bdrv_uses_whitelist() is true. */
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
            error_setg(errp,
                       "Use of AES-CBC encrypted qcow images is no longer "
                       "supported in system emulators");
            error_append_hint(errp,
                              "You can use 'qemu-img convert' to convert your "
                              "image to an alternative supported format, such "
                              "as unencrypted qcow, or raw with the LUKS "
                              "format instead.\n");
            ret = -ENOSYS;
            goto fail;
        }
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            /* An explicit "encrypt.format" must agree with the header. */
            if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
                error_setg(errp,
                           "Header reported 'aes' encryption format but "
                           "options specify '%s'", encryptfmt);
                ret = -EINVAL;
                goto fail;
            }
            /* The crypto layer is asked for its "qcow" format. */
            qdict_put_str(encryptopts, "format", "qcow");
            crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
            if (!crypto_opts) {
                ret = -EINVAL;
                goto fail;
            }

            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
            s->crypto = qcrypto_block_open(crypto_opts, "encrypt.",
                                           NULL, NULL, cflags, errp);
            if (!s->crypto) {
                ret = -EINVAL;
                goto fail;
            }
        } else {
            error_setg(errp, "invalid encryption method in qcow header");
            ret = -EINVAL;
            goto fail;
        }
        bs->encrypted = true;
    } else {
        /* Unencrypted image: an encryption format option is an error. */
        if (encryptfmt) {
            error_setg(errp, "No encryption in image header, but options "
                       "specified format '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
    }
    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->l2_bits = header.l2_bits;
    s->l2_size = 1 << s->l2_bits;
    bs->total_sectors = header.size / 512;
    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;

    /* read the level 1 table */
    /* One L1 entry covers 1 << shift bytes of guest data. */
    shift = s->cluster_bits + s->l2_bits;
    if (header.size > UINT64_MAX - (1LL << shift)) {
        /* The round-up below would overflow. */
        error_setg(errp, "Image too large");
        ret = -EINVAL;
        goto fail;
    } else {
        uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift;
        /* Keep the table's byte size within INT_MAX. */
        if (l1_size > INT_MAX / sizeof(uint64_t)) {
            error_setg(errp, "Image too large");
            ret = -EINVAL;
            goto fail;
        }
        s->l1_size = l1_size;
    }

    s->l1_table_offset = header.l1_table_offset;
    s->l1_table = g_try_new(uint64_t, s->l1_size);
    if (s->l1_table == NULL) {
        error_setg(errp, "Could not allocate memory for L1 table");
        ret = -ENOMEM;
        goto fail;
    }

    ret = bdrv_pread(bs->file, s->l1_table_offset,
                     s->l1_size * sizeof(uint64_t), s->l1_table, 0);
    if (ret < 0) {
        goto fail;
    }

    /* Keep the in-memory L1 table in host byte order. */
    for(i = 0;i < s->l1_size; i++) {
        s->l1_table[i] = be64_to_cpu(s->l1_table[i]);
    }

    /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */
    s->l2_cache =
        qemu_try_blockalign(bs->file->bs,
                            s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    if (s->l2_cache == NULL) {
        error_setg(errp, "Could not allocate L2 table cache");
        ret = -ENOMEM;
        goto fail;
    }
    s->cluster_cache = g_malloc(s->cluster_size);
    s->cluster_data = g_malloc(s->cluster_size);
    /* -1: cluster_cache holds no decompressed cluster yet. */
    s->cluster_cache_offset = -1;

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > 1023 || len >= sizeof(bs->backing_file)) {
            error_setg(errp, "Backing file name too long");
            ret = -EINVAL;
            goto fail;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset, len,
                         bs->auto_backing_file, 0);
        if (ret < 0) {
            goto fail;
        }
        /* The stored name carries no terminator. */
        bs->auto_backing_file[len] = '\0';
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->auto_backing_file);
    }

    /* Disable migration when qcow images are used */
    error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
               "does not support live migration",
               bdrv_get_device_or_node_name(bs));

    ret = migrate_add_blocker_normal(&s->migration_blocker, errp);
    if (ret < 0) {
        goto fail;
    }

    /* Success: only the option objects are released. */
    qobject_unref(encryptopts);
    qapi_free_QCryptoBlockOpenOptions(crypto_opts);
    qemu_co_mutex_init(&s->lock);
    bdrv_graph_rdunlock_main_loop();
    return 0;

fail:
    bdrv_graph_rdunlock_main_loop();
fail_unlocked:
    /* Release whatever was set up before the failure. */
    g_free(s->l1_table);
    qemu_vfree(s->l2_cache);
    g_free(s->cluster_cache);
    g_free(s->cluster_data);
    qcrypto_block_free(s->crypto);
    qobject_unref(encryptopts);
    qapi_free_QCryptoBlockOpenOptions(crypto_opts);
    return ret;
}
333  
334  
335  /* We have nothing to do for QCOW reopen, stubs just return
336   * success */
337  static int qcow_reopen_prepare(BDRVReopenState *state,
338                                 BlockReopenQueue *queue, Error **errp)
339  {
340      return 0;
341  }
342  
343  
344  /* 'allocate' is:
345   *
346   * 0 to not allocate.
347   *
348   * 1 to allocate a normal cluster (for sector-aligned byte offsets 'n_start'
349   * to 'n_end' within the cluster)
350   *
351   * 2 to allocate a compressed cluster of size
352   * 'compressed_size'. 'compressed_size' must be > 0 and <
353   * cluster_size
354   *
355   * return 0 if not allocated, 1 if *result is assigned, and negative
356   * errno on failure.
357   */
358  static int coroutine_fn GRAPH_RDLOCK
359  get_cluster_offset(BlockDriverState *bs, uint64_t offset, int allocate,
360                     int compressed_size, int n_start, int n_end,
361                     uint64_t *result)
362  {
363      BDRVQcowState *s = bs->opaque;
364      int min_index, i, j, l1_index, l2_index, ret;
365      int64_t l2_offset;
366      uint64_t *l2_table, cluster_offset, tmp;
367      uint32_t min_count;
368      int new_l2_table;
369  
370      *result = 0;
371      l1_index = offset >> (s->l2_bits + s->cluster_bits);
372      l2_offset = s->l1_table[l1_index];
373      new_l2_table = 0;
374      if (!l2_offset) {
375          if (!allocate)
376              return 0;
377          /* allocate a new l2 entry */
378          l2_offset = bdrv_co_getlength(bs->file->bs);
379          if (l2_offset < 0) {
380              return l2_offset;
381          }
382          /* round to cluster size */
383          l2_offset = QEMU_ALIGN_UP(l2_offset, s->cluster_size);
384          /* update the L1 entry */
385          s->l1_table[l1_index] = l2_offset;
386          tmp = cpu_to_be64(l2_offset);
387          BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_UPDATE);
388          ret = bdrv_co_pwrite_sync(bs->file,
389                                    s->l1_table_offset + l1_index * sizeof(tmp),
390                                    sizeof(tmp), &tmp, 0);
391          if (ret < 0) {
392              return ret;
393          }
394          new_l2_table = 1;
395      }
396      for(i = 0; i < L2_CACHE_SIZE; i++) {
397          if (l2_offset == s->l2_cache_offsets[i]) {
398              /* increment the hit count */
399              if (++s->l2_cache_counts[i] == 0xffffffff) {
400                  for(j = 0; j < L2_CACHE_SIZE; j++) {
401                      s->l2_cache_counts[j] >>= 1;
402                  }
403              }
404              l2_table = s->l2_cache + (i << s->l2_bits);
405              goto found;
406          }
407      }
408      /* not found: load a new entry in the least used one */
409      min_index = 0;
410      min_count = 0xffffffff;
411      for(i = 0; i < L2_CACHE_SIZE; i++) {
412          if (s->l2_cache_counts[i] < min_count) {
413              min_count = s->l2_cache_counts[i];
414              min_index = i;
415          }
416      }
417      l2_table = s->l2_cache + (min_index << s->l2_bits);
418      BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_LOAD);
419      if (new_l2_table) {
420          memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
421          ret = bdrv_co_pwrite_sync(bs->file, l2_offset,
422                                    s->l2_size * sizeof(uint64_t), l2_table, 0);
423          if (ret < 0) {
424              return ret;
425          }
426      } else {
427          ret = bdrv_co_pread(bs->file, l2_offset,
428                              s->l2_size * sizeof(uint64_t), l2_table, 0);
429          if (ret < 0) {
430              return ret;
431          }
432      }
433      s->l2_cache_offsets[min_index] = l2_offset;
434      s->l2_cache_counts[min_index] = 1;
435   found:
436      l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
437      cluster_offset = be64_to_cpu(l2_table[l2_index]);
438      if (!cluster_offset ||
439          ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
440          if (!allocate)
441              return 0;
442          BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
443          assert(QEMU_IS_ALIGNED(n_start | n_end, BDRV_SECTOR_SIZE));
444          /* allocate a new cluster */
445          if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
446              (n_end - n_start) < s->cluster_size) {
447              /* if the cluster is already compressed, we must
448                 decompress it in the case it is not completely
449                 overwritten */
450              if (decompress_cluster(bs, cluster_offset) < 0) {
451                  return -EIO;
452              }
453              cluster_offset = bdrv_co_getlength(bs->file->bs);
454              if ((int64_t) cluster_offset < 0) {
455                  return cluster_offset;
456              }
457              cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
458              /* write the cluster content */
459              BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
460              ret = bdrv_co_pwrite(bs->file, cluster_offset, s->cluster_size,
461                                   s->cluster_cache, 0);
462              if (ret < 0) {
463                  return ret;
464              }
465          } else {
466              cluster_offset = bdrv_co_getlength(bs->file->bs);
467              if ((int64_t) cluster_offset < 0) {
468                  return cluster_offset;
469              }
470              if (allocate == 1) {
471                  /* round to cluster size */
472                  cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
473                  if (cluster_offset + s->cluster_size > INT64_MAX) {
474                      return -E2BIG;
475                  }
476                  ret = bdrv_co_truncate(bs->file,
477                                         cluster_offset + s->cluster_size,
478                                         false, PREALLOC_MODE_OFF, 0, NULL);
479                  if (ret < 0) {
480                      return ret;
481                  }
482                  /* if encrypted, we must initialize the cluster
483                     content which won't be written */
484                  if (bs->encrypted &&
485                      (n_end - n_start) < s->cluster_size) {
486                      uint64_t start_offset;
487                      assert(s->crypto);
488                      start_offset = offset & ~(s->cluster_size - 1);
489                      for (i = 0; i < s->cluster_size; i += BDRV_SECTOR_SIZE) {
490                          if (i < n_start || i >= n_end) {
491                              memset(s->cluster_data, 0x00, BDRV_SECTOR_SIZE);
492                              if (qcrypto_block_encrypt(s->crypto,
493                                                        start_offset + i,
494                                                        s->cluster_data,
495                                                        BDRV_SECTOR_SIZE,
496                                                        NULL) < 0) {
497                                  return -EIO;
498                              }
499                              BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
500                              ret = bdrv_co_pwrite(bs->file, cluster_offset + i,
501                                                   BDRV_SECTOR_SIZE,
502                                                   s->cluster_data, 0);
503                              if (ret < 0) {
504                                  return ret;
505                              }
506                          }
507                      }
508                  }
509              } else if (allocate == 2) {
510                  cluster_offset |= QCOW_OFLAG_COMPRESSED |
511                      (uint64_t)compressed_size << (63 - s->cluster_bits);
512              }
513          }
514          /* update L2 table */
515          tmp = cpu_to_be64(cluster_offset);
516          l2_table[l2_index] = tmp;
517          if (allocate == 2) {
518              BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
519          } else {
520              BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE);
521          }
522          ret = bdrv_co_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
523                                    sizeof(tmp), &tmp, 0);
524          if (ret < 0) {
525              return ret;
526          }
527      }
528      *result = cluster_offset;
529      return 1;
530  }
531  
532  static int coroutine_fn GRAPH_RDLOCK
533  qcow_co_block_status(BlockDriverState *bs, bool want_zero,
534                       int64_t offset, int64_t bytes, int64_t *pnum,
535                       int64_t *map, BlockDriverState **file)
536  {
537      BDRVQcowState *s = bs->opaque;
538      int index_in_cluster, ret;
539      int64_t n;
540      uint64_t cluster_offset;
541  
542      qemu_co_mutex_lock(&s->lock);
543      ret = get_cluster_offset(bs, offset, 0, 0, 0, 0, &cluster_offset);
544      qemu_co_mutex_unlock(&s->lock);
545      if (ret < 0) {
546          return ret;
547      }
548      index_in_cluster = offset & (s->cluster_size - 1);
549      n = s->cluster_size - index_in_cluster;
550      if (n > bytes) {
551          n = bytes;
552      }
553      *pnum = n;
554      if (!cluster_offset) {
555          return 0;
556      }
557      if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
558          return BDRV_BLOCK_DATA | BDRV_BLOCK_COMPRESSED;
559      }
560      if (s->crypto) {
561          return BDRV_BLOCK_DATA;
562      }
563      *map = cluster_offset | index_in_cluster;
564      *file = bs->file->bs;
565      return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
566  }
567  
568  static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
569                               const uint8_t *buf, int buf_size)
570  {
571      z_stream strm1, *strm = &strm1;
572      int ret, out_len;
573  
574      memset(strm, 0, sizeof(*strm));
575  
576      strm->next_in = (uint8_t *)buf;
577      strm->avail_in = buf_size;
578      strm->next_out = out_buf;
579      strm->avail_out = out_buf_size;
580  
581      ret = inflateInit2(strm, -12);
582      if (ret != Z_OK)
583          return -1;
584      ret = inflate(strm, Z_FINISH);
585      out_len = strm->next_out - out_buf;
586      if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
587          out_len != out_buf_size) {
588          inflateEnd(strm);
589          return -1;
590      }
591      inflateEnd(strm);
592      return 0;
593  }
594  
595  static int coroutine_fn GRAPH_RDLOCK
596  decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
597  {
598      BDRVQcowState *s = bs->opaque;
599      int ret, csize;
600      uint64_t coffset;
601  
602      coffset = cluster_offset & s->cluster_offset_mask;
603      if (s->cluster_cache_offset != coffset) {
604          csize = cluster_offset >> (63 - s->cluster_bits);
605          csize &= (s->cluster_size - 1);
606          BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
607          ret = bdrv_co_pread(bs->file, coffset, csize, s->cluster_data, 0);
608          if (ret < 0)
609              return -1;
610          if (decompress_buffer(s->cluster_cache, s->cluster_size,
611                                s->cluster_data, csize) < 0) {
612              return -1;
613          }
614          s->cluster_cache_offset = coffset;
615      }
616      return 0;
617  }
618  
619  static void qcow_refresh_limits(BlockDriverState *bs, Error **errp)
620  {
621      /* At least encrypted images require 512-byte alignment. Apply the
622       * limit universally, rather than just on encrypted images, as
623       * it's easier to let the block layer handle rounding than to
624       * audit this code further. */
625      bs->bl.request_alignment = BDRV_SECTOR_SIZE;
626  }
627  
628  static int coroutine_fn GRAPH_RDLOCK
629  qcow_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
630                 QEMUIOVector *qiov, BdrvRequestFlags flags)
631  {
632      BDRVQcowState *s = bs->opaque;
633      int offset_in_cluster;
634      int ret = 0, n;
635      uint64_t cluster_offset;
636      uint8_t *buf;
637      void *orig_buf;
638  
639      if (qiov->niov > 1) {
640          buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
641          if (buf == NULL) {
642              return -ENOMEM;
643          }
644      } else {
645          orig_buf = NULL;
646          buf = (uint8_t *)qiov->iov->iov_base;
647      }
648  
649      qemu_co_mutex_lock(&s->lock);
650  
651      while (bytes != 0) {
652          /* prepare next request */
653          ret = get_cluster_offset(bs, offset, 0, 0, 0, 0, &cluster_offset);
654          if (ret < 0) {
655              break;
656          }
657          offset_in_cluster = offset & (s->cluster_size - 1);
658          n = s->cluster_size - offset_in_cluster;
659          if (n > bytes) {
660              n = bytes;
661          }
662  
663          if (!cluster_offset) {
664              if (bs->backing) {
665                  /* read from the base image */
666                  qemu_co_mutex_unlock(&s->lock);
667                  /* qcow2 emits this on bs->file instead of bs->backing */
668                  BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
669                  ret = bdrv_co_pread(bs->backing, offset, n, buf, 0);
670                  qemu_co_mutex_lock(&s->lock);
671                  if (ret < 0) {
672                      break;
673                  }
674              } else {
675                  /* Note: in this case, no need to wait */
676                  memset(buf, 0, n);
677              }
678          } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
679              /* add AIO support for compressed blocks ? */
680              if (decompress_cluster(bs, cluster_offset) < 0) {
681                  ret = -EIO;
682                  break;
683              }
684              memcpy(buf, s->cluster_cache + offset_in_cluster, n);
685          } else {
686              if ((cluster_offset & 511) != 0) {
687                  ret = -EIO;
688                  break;
689              }
690              qemu_co_mutex_unlock(&s->lock);
691              BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
692              ret = bdrv_co_pread(bs->file, cluster_offset + offset_in_cluster,
693                                  n, buf, 0);
694              qemu_co_mutex_lock(&s->lock);
695              if (ret < 0) {
696                  break;
697              }
698              if (bs->encrypted) {
699                  assert(s->crypto);
700                  if (qcrypto_block_decrypt(s->crypto,
701                                            offset, buf, n, NULL) < 0) {
702                      ret = -EIO;
703                      break;
704                  }
705              }
706          }
707          ret = 0;
708  
709          bytes -= n;
710          offset += n;
711          buf += n;
712      }
713  
714      qemu_co_mutex_unlock(&s->lock);
715  
716      if (qiov->niov > 1) {
717          qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
718          qemu_vfree(orig_buf);
719      }
720  
721      return ret;
722  }
723  
724  static int coroutine_fn GRAPH_RDLOCK
725  qcow_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
726                  QEMUIOVector *qiov, BdrvRequestFlags flags)
727  {
728      BDRVQcowState *s = bs->opaque;
729      int offset_in_cluster;
730      uint64_t cluster_offset;
731      int ret = 0, n;
732      uint8_t *buf;
733      void *orig_buf;
734  
735      s->cluster_cache_offset = -1; /* disable compressed cache */
736  
737      /* We must always copy the iov when encrypting, so we
738       * don't modify the original data buffer during encryption */
739      if (bs->encrypted || qiov->niov > 1) {
740          buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
741          if (buf == NULL) {
742              return -ENOMEM;
743          }
744          qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
745      } else {
746          orig_buf = NULL;
747          buf = (uint8_t *)qiov->iov->iov_base;
748      }
749  
750      qemu_co_mutex_lock(&s->lock);
751  
752      while (bytes != 0) {
753          offset_in_cluster = offset & (s->cluster_size - 1);
754          n = s->cluster_size - offset_in_cluster;
755          if (n > bytes) {
756              n = bytes;
757          }
758          ret = get_cluster_offset(bs, offset, 1, 0, offset_in_cluster,
759                                   offset_in_cluster + n, &cluster_offset);
760          if (ret < 0) {
761              break;
762          }
763          if (!cluster_offset || (cluster_offset & 511) != 0) {
764              ret = -EIO;
765              break;
766          }
767          if (bs->encrypted) {
768              assert(s->crypto);
769              if (qcrypto_block_encrypt(s->crypto, offset, buf, n, NULL) < 0) {
770                  ret = -EIO;
771                  break;
772              }
773          }
774  
775          qemu_co_mutex_unlock(&s->lock);
776          BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
777          ret = bdrv_co_pwrite(bs->file, cluster_offset + offset_in_cluster,
778                               n, buf, 0);
779          qemu_co_mutex_lock(&s->lock);
780          if (ret < 0) {
781              break;
782          }
783          ret = 0;
784  
785          bytes -= n;
786          offset += n;
787          buf += n;
788      }
789      qemu_co_mutex_unlock(&s->lock);
790  
791      qemu_vfree(orig_buf);
792  
793      return ret;
794  }
795  
796  static void qcow_close(BlockDriverState *bs)
797  {
798      BDRVQcowState *s = bs->opaque;
799  
800      qcrypto_block_free(s->crypto);
801      s->crypto = NULL;
802      g_free(s->l1_table);
803      qemu_vfree(s->l2_cache);
804      g_free(s->cluster_cache);
805      g_free(s->cluster_data);
806  
807      migrate_del_blocker(&s->migration_blocker);
808  }
809  
810  static int coroutine_fn GRAPH_UNLOCKED
811  qcow_co_create(BlockdevCreateOptions *opts, Error **errp)
812  {
813      BlockdevCreateOptionsQcow *qcow_opts;
814      int header_size, backing_filename_len, l1_size, shift, i;
815      QCowHeader header;
816      uint8_t *tmp;
817      int64_t total_size = 0;
818      int ret;
819      BlockDriverState *bs;
820      BlockBackend *qcow_blk;
821      QCryptoBlock *crypto = NULL;
822  
823      assert(opts->driver == BLOCKDEV_DRIVER_QCOW);
824      qcow_opts = &opts->u.qcow;
825  
826      /* Sanity checks */
827      total_size = qcow_opts->size;
828      if (total_size == 0) {
829          error_setg(errp, "Image size is too small, cannot be zero length");
830          return -EINVAL;
831      }
832  
833      if (qcow_opts->encrypt &&
834          qcow_opts->encrypt->format != Q_CRYPTO_BLOCK_FORMAT_QCOW)
835      {
836          error_setg(errp, "Unsupported encryption format");
837          return -EINVAL;
838      }
839  
840      /* Create BlockBackend to write to the image */
841      bs = bdrv_co_open_blockdev_ref(qcow_opts->file, errp);
842      if (bs == NULL) {
843          return -EIO;
844      }
845  
846      qcow_blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE,
847                                    BLK_PERM_ALL, errp);
848      if (!qcow_blk) {
849          ret = -EPERM;
850          goto exit;
851      }
852      blk_set_allow_write_beyond_eof(qcow_blk, true);
853  
854      /* Create image format */
855      memset(&header, 0, sizeof(header));
856      header.magic = cpu_to_be32(QCOW_MAGIC);
857      header.version = cpu_to_be32(QCOW_VERSION);
858      header.size = cpu_to_be64(total_size);
859      header_size = sizeof(header);
860      backing_filename_len = 0;
861      if (qcow_opts->backing_file) {
862          if (strcmp(qcow_opts->backing_file, "fat:")) {
863              header.backing_file_offset = cpu_to_be64(header_size);
864              backing_filename_len = strlen(qcow_opts->backing_file);
865              header.backing_file_size = cpu_to_be32(backing_filename_len);
866              header_size += backing_filename_len;
867          } else {
868              /* special backing file for vvfat */
869              qcow_opts->backing_file = NULL;
870          }
871          header.cluster_bits = 9; /* 512 byte cluster to avoid copying
872                                      unmodified sectors */
873          header.l2_bits = 12; /* 32 KB L2 tables */
874      } else {
875          header.cluster_bits = 12; /* 4 KB clusters */
876          header.l2_bits = 9; /* 4 KB L2 tables */
877      }
878      header_size = (header_size + 7) & ~7;
879      shift = header.cluster_bits + header.l2_bits;
880      l1_size = (total_size + (1LL << shift) - 1) >> shift;
881  
882      header.l1_table_offset = cpu_to_be64(header_size);
883  
884      if (qcow_opts->encrypt) {
885          header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
886  
887          crypto = qcrypto_block_create(qcow_opts->encrypt, "encrypt.",
888                                        NULL, NULL, NULL, 0, errp);
889          if (!crypto) {
890              ret = -EINVAL;
891              goto exit;
892          }
893      } else {
894          header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
895      }
896  
897      /* write all the data */
898      ret = blk_co_pwrite(qcow_blk, 0, sizeof(header), &header, 0);
899      if (ret < 0) {
900          goto exit;
901      }
902  
903      if (qcow_opts->backing_file) {
904          ret = blk_co_pwrite(qcow_blk, sizeof(header), backing_filename_len,
905                              qcow_opts->backing_file, 0);
906          if (ret < 0) {
907              goto exit;
908          }
909      }
910  
911      tmp = g_malloc0(BDRV_SECTOR_SIZE);
912      for (i = 0; i < DIV_ROUND_UP(sizeof(uint64_t) * l1_size, BDRV_SECTOR_SIZE);
913           i++) {
914          ret = blk_co_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i,
915                              BDRV_SECTOR_SIZE, tmp, 0);
916          if (ret < 0) {
917              g_free(tmp);
918              goto exit;
919          }
920      }
921  
922      g_free(tmp);
923      ret = 0;
924  exit:
925      blk_co_unref(qcow_blk);
926      bdrv_co_unref(bs);
927      qcrypto_block_free(crypto);
928      return ret;
929  }
930  
931  static int coroutine_fn GRAPH_UNLOCKED
932  qcow_co_create_opts(BlockDriver *drv, const char *filename,
933                      QemuOpts *opts, Error **errp)
934  {
935      BlockdevCreateOptions *create_options = NULL;
936      BlockDriverState *bs = NULL;
937      QDict *qdict = NULL;
938      Visitor *v;
939      const char *val;
940      int ret;
941      char *backing_fmt;
942  
943      static const QDictRenames opt_renames[] = {
944          { BLOCK_OPT_BACKING_FILE,       "backing-file" },
945          { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
946          { NULL, NULL },
947      };
948  
949      /*
950       * We can't actually store a backing format, but can check that
951       * the user's request made sense.
952       */
953      backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
954      if (backing_fmt && !bdrv_find_format(backing_fmt)) {
955          error_setg(errp, "unrecognized backing format '%s'", backing_fmt);
956          ret = -EINVAL;
957          goto fail;
958      }
959  
960      /* Parse options and convert legacy syntax */
961      qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qcow_create_opts, true);
962  
963      val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
964      if (val && !strcmp(val, "on")) {
965          qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
966      } else if (val && !strcmp(val, "off")) {
967          qdict_del(qdict, BLOCK_OPT_ENCRYPT);
968      }
969  
970      val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
971      if (val && !strcmp(val, "aes")) {
972          qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
973      }
974  
975      if (!qdict_rename_keys(qdict, opt_renames, errp)) {
976          ret = -EINVAL;
977          goto fail;
978      }
979  
980      /* Create and open the file (protocol layer) */
981      ret = bdrv_co_create_file(filename, opts, errp);
982      if (ret < 0) {
983          goto fail;
984      }
985  
986      bs = bdrv_co_open(filename, NULL, NULL,
987                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
988      if (bs == NULL) {
989          ret = -EIO;
990          goto fail;
991      }
992  
993      /* Now get the QAPI type BlockdevCreateOptions */
994      qdict_put_str(qdict, "driver", "qcow");
995      qdict_put_str(qdict, "file", bs->node_name);
996  
997      v = qobject_input_visitor_new_flat_confused(qdict, errp);
998      if (!v) {
999          ret = -EINVAL;
1000          goto fail;
1001      }
1002  
1003      visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
1004      visit_free(v);
1005      if (!create_options) {
1006          ret = -EINVAL;
1007          goto fail;
1008      }
1009  
1010      /* Silently round up size */
1011      assert(create_options->driver == BLOCKDEV_DRIVER_QCOW);
1012      create_options->u.qcow.size =
1013          ROUND_UP(create_options->u.qcow.size, BDRV_SECTOR_SIZE);
1014  
1015      /* Create the qcow image (format layer) */
1016      ret = qcow_co_create(create_options, errp);
1017      if (ret < 0) {
1018          goto fail;
1019      }
1020  
1021      ret = 0;
1022  fail:
1023      g_free(backing_fmt);
1024      qobject_unref(qdict);
1025      bdrv_co_unref(bs);
1026      qapi_free_BlockdevCreateOptions(create_options);
1027      return ret;
1028  }
1029  
1030  static int GRAPH_RDLOCK qcow_make_empty(BlockDriverState *bs)
1031  {
1032      BDRVQcowState *s = bs->opaque;
1033      uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1034      int ret;
1035  
1036      memset(s->l1_table, 0, l1_length);
1037      if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, l1_length, s->l1_table,
1038                           0) < 0)
1039          return -1;
1040      ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, false,
1041                          PREALLOC_MODE_OFF, 0, NULL);
1042      if (ret < 0)
1043          return ret;
1044  
1045      memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
1046      memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
1047      memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
1048  
1049      return 0;
1050  }
1051  
1052  /* XXX: put compressed sectors first, then all the cluster aligned
1053     tables to avoid losing bytes in alignment */
1054  static int coroutine_fn GRAPH_RDLOCK
1055  qcow_co_pwritev_compressed(BlockDriverState *bs, int64_t offset, int64_t bytes,
1056                             QEMUIOVector *qiov)
1057  {
1058      BDRVQcowState *s = bs->opaque;
1059      z_stream strm;
1060      int ret, out_len;
1061      uint8_t *buf, *out_buf;
1062      uint64_t cluster_offset;
1063  
1064      buf = qemu_blockalign(bs, s->cluster_size);
1065      if (bytes != s->cluster_size) {
1066          if (bytes > s->cluster_size ||
1067              offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
1068          {
1069              qemu_vfree(buf);
1070              return -EINVAL;
1071          }
1072          /* Zero-pad last write if image size is not cluster aligned */
1073          memset(buf + bytes, 0, s->cluster_size - bytes);
1074      }
1075      qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
1076  
1077      out_buf = g_malloc(s->cluster_size);
1078  
1079      /* best compression, small window, no zlib header */
1080      memset(&strm, 0, sizeof(strm));
1081      ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1082                         Z_DEFLATED, -12,
1083                         9, Z_DEFAULT_STRATEGY);
1084      if (ret != 0) {
1085          ret = -EINVAL;
1086          goto fail;
1087      }
1088  
1089      strm.avail_in = s->cluster_size;
1090      strm.next_in = (uint8_t *)buf;
1091      strm.avail_out = s->cluster_size;
1092      strm.next_out = out_buf;
1093  
1094      ret = deflate(&strm, Z_FINISH);
1095      if (ret != Z_STREAM_END && ret != Z_OK) {
1096          deflateEnd(&strm);
1097          ret = -EINVAL;
1098          goto fail;
1099      }
1100      out_len = strm.next_out - out_buf;
1101  
1102      deflateEnd(&strm);
1103  
1104      if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1105          /* could not compress: write normal cluster */
1106          ret = qcow_co_pwritev(bs, offset, bytes, qiov, 0);
1107          if (ret < 0) {
1108              goto fail;
1109          }
1110          goto success;
1111      }
1112      qemu_co_mutex_lock(&s->lock);
1113      ret = get_cluster_offset(bs, offset, 2, out_len, 0, 0, &cluster_offset);
1114      qemu_co_mutex_unlock(&s->lock);
1115      if (ret < 0) {
1116          goto fail;
1117      }
1118      if (cluster_offset == 0) {
1119          ret = -EIO;
1120          goto fail;
1121      }
1122      cluster_offset &= s->cluster_offset_mask;
1123  
1124      BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
1125      ret = bdrv_co_pwrite(bs->file, cluster_offset, out_len, out_buf, 0);
1126      if (ret < 0) {
1127          goto fail;
1128      }
1129  success:
1130      ret = 0;
1131  fail:
1132      qemu_vfree(buf);
1133      g_free(out_buf);
1134      return ret;
1135  }
1136  
1137  static int coroutine_fn
1138  qcow_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1139  {
1140      BDRVQcowState *s = bs->opaque;
1141      bdi->cluster_size = s->cluster_size;
1142      return 0;
1143  }
1144  
1145  static QemuOptsList qcow_create_opts = {
1146      .name = "qcow-create-opts",
1147      .head = QTAILQ_HEAD_INITIALIZER(qcow_create_opts.head),
1148      .desc = {
1149          {
1150              .name = BLOCK_OPT_SIZE,
1151              .type = QEMU_OPT_SIZE,
1152              .help = "Virtual disk size"
1153          },
1154          {
1155              .name = BLOCK_OPT_BACKING_FILE,
1156              .type = QEMU_OPT_STRING,
1157              .help = "File name of a base image"
1158          },
1159          {
1160              .name = BLOCK_OPT_BACKING_FMT,
1161              .type = QEMU_OPT_STRING,
1162              .help = "Format of the backing image",
1163          },
1164          {
1165              .name = BLOCK_OPT_ENCRYPT,
1166              .type = QEMU_OPT_BOOL,
1167              .help = "Encrypt the image with format 'aes'. (Deprecated "
1168                      "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
1169          },
1170          {
1171              .name = BLOCK_OPT_ENCRYPT_FORMAT,
1172              .type = QEMU_OPT_STRING,
1173              .help = "Encrypt the image, format choices: 'aes'",
1174          },
1175          BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET("encrypt."),
1176          { /* end of list */ }
1177      }
1178  };
1179  
1180  static const char *const qcow_strong_runtime_opts[] = {
1181      "encrypt." BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET,
1182  
1183      NULL
1184  };
1185  
1186  static BlockDriver bdrv_qcow = {
1187      .format_name	= "qcow",
1188      .instance_size	= sizeof(BDRVQcowState),
1189      .bdrv_probe		= qcow_probe,
1190      .bdrv_open		= qcow_open,
1191      .bdrv_close		= qcow_close,
1192      .bdrv_child_perm        = bdrv_default_perms,
1193      .bdrv_reopen_prepare    = qcow_reopen_prepare,
1194      .bdrv_co_create         = qcow_co_create,
1195      .bdrv_co_create_opts    = qcow_co_create_opts,
1196      .bdrv_has_zero_init     = bdrv_has_zero_init_1,
1197      .is_format              = true,
1198      .supports_backing       = true,
1199      .bdrv_refresh_limits    = qcow_refresh_limits,
1200  
1201      .bdrv_co_preadv         = qcow_co_preadv,
1202      .bdrv_co_pwritev        = qcow_co_pwritev,
1203      .bdrv_co_block_status   = qcow_co_block_status,
1204  
1205      .bdrv_make_empty        = qcow_make_empty,
1206      .bdrv_co_pwritev_compressed = qcow_co_pwritev_compressed,
1207      .bdrv_co_get_info       = qcow_co_get_info,
1208  
1209      .create_opts            = &qcow_create_opts,
1210      .strong_runtime_opts    = qcow_strong_runtime_opts,
1211  };
1212  
1213  static void bdrv_qcow_init(void)
1214  {
1215      bdrv_register(&bdrv_qcow);
1216  }
1217  
1218  block_init(bdrv_qcow_init);
1219