xref: /openbmc/qemu/block/qcow.c (revision 4a4ff4c5)
1 /*
2  * Block driver for the QCOW format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu/error-report.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/bswap.h"
33 #include <zlib.h>
34 #include "qapi/qmp/qdict.h"
35 #include "qapi/qmp/qstring.h"
36 #include "qapi/qobject-input-visitor.h"
37 #include "qapi/qapi-visit-block-core.h"
38 #include "crypto/block.h"
39 #include "migration/blocker.h"
40 #include "block/crypto.h"
41 
42 /**************************************************************/
43 /* QEMU COW block driver with compression and encryption support */
44 
45 #define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
46 #define QCOW_VERSION 1
47 
48 #define QCOW_CRYPT_NONE 0
49 #define QCOW_CRYPT_AES  1
50 
51 #define QCOW_OFLAG_COMPRESSED (1LL << 63)
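
/*
 * Editor's note (illustrative sketch, not used by the driver): the magic
 * above is stored big-endian, so the first four bytes of a qcow v1 image
 * are the literal characters 'Q', 'F', 'I' followed by 0xfb.  A minimal
 * standalone check over a raw buffer could therefore look like this:
 */
static inline bool qcow1_magic_matches(const uint8_t *buf)
{
    /* byte order as it appears at offset 0 of the image file */
    static const uint8_t magic[4] = { 'Q', 'F', 'I', 0xfb };

    return memcmp(buf, magic, sizeof(magic)) == 0;
}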
52 
53 typedef struct QCowHeader {
54     uint32_t magic;
55     uint32_t version;
56     uint64_t backing_file_offset;
57     uint32_t backing_file_size;
58     uint32_t mtime;
59     uint64_t size; /* in bytes */
60     uint8_t cluster_bits;
61     uint8_t l2_bits;
62     uint16_t padding;
63     uint32_t crypt_method;
64     uint64_t l1_table_offset;
65 } QEMU_PACKED QCowHeader;
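
/*
 * Editor's note (illustrative sketch, not part of the driver proper): with
 * QEMU_PACKED the on-disk v1 header is exactly 48 bytes.  A compile-time
 * sanity check over the layout could be written as:
 */
static inline void qcow1_check_header_layout(void)
{
    /* 4+4+8+4+4+8+1+1+2+4+8 bytes == 48, matching the v1 on-disk layout */
    QEMU_BUILD_BUG_ON(sizeof(QCowHeader) != 48);
}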
66 
67 #define L2_CACHE_SIZE 16
68 
69 typedef struct BDRVQcowState {
70     int cluster_bits;
71     int cluster_size;
72     int cluster_sectors;
73     int l2_bits;
74     int l2_size;
75     unsigned int l1_size;
76     uint64_t cluster_offset_mask;
77     uint64_t l1_table_offset;
78     uint64_t *l1_table;
79     uint64_t *l2_cache;
80     uint64_t l2_cache_offsets[L2_CACHE_SIZE];
81     uint32_t l2_cache_counts[L2_CACHE_SIZE];
82     uint8_t *cluster_cache;
83     uint8_t *cluster_data;
84     uint64_t cluster_cache_offset;
85     QCryptoBlock *crypto; /* Disk encryption format driver */
86     uint32_t crypt_method_header;
87     CoMutex lock;
88     Error *migration_blocker;
89 } BDRVQcowState;
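
/*
 * Editor's note (illustrative sketch, not called by the driver): a guest
 * offset decomposes into an L1 index, an L2 index and a byte offset within
 * the cluster, exactly as get_cluster_offset() does further down.
 */
static inline void qcow1_split_offset(const BDRVQcowState *s, uint64_t offset,
                                      uint64_t *l1_index, uint64_t *l2_index,
                                      uint64_t *in_cluster)
{
    *l1_index = offset >> (s->l2_bits + s->cluster_bits);      /* which L2 table */
    *l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); /* entry in it */
    *in_cluster = offset & (s->cluster_size - 1);    /* byte inside the cluster */
}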
90 
91 static QemuOptsList qcow_create_opts;
92 
93 static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
94 
95 static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
96 {
97     const QCowHeader *cow_header = (const void *)buf;
98 
99     if (buf_size >= sizeof(QCowHeader) &&
100         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
101         be32_to_cpu(cow_header->version) == QCOW_VERSION)
102         return 100;
103     else
104         return 0;
105 }
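
/*
 * Editor's note (illustrative sketch): the block layer compares the score
 * returned by bdrv_probe implementations across drivers; 100 is a definite
 * match.  A standalone check over an in-memory buffer reduces to:
 */
static inline bool qcow1_buffer_is_qcow(const uint8_t *buf, int buf_size)
{
    /* the filename argument is unused by qcow_probe() */
    return qcow_probe(buf, buf_size, NULL) == 100;
}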
106 
107 static QemuOptsList qcow_runtime_opts = {
108     .name = "qcow",
109     .head = QTAILQ_HEAD_INITIALIZER(qcow_runtime_opts.head),
110     .desc = {
111         BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET("encrypt."),
112         { /* end of list */ }
113     },
114 };
115 
116 static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
117                      Error **errp)
118 {
119     BDRVQcowState *s = bs->opaque;
120     unsigned int len, i, shift;
121     int ret;
122     QCowHeader header;
123     Error *local_err = NULL;
124     QCryptoBlockOpenOptions *crypto_opts = NULL;
125     unsigned int cflags = 0;
126     QDict *encryptopts = NULL;
127     const char *encryptfmt;
128 
129     qdict_extract_subqdict(options, &encryptopts, "encrypt.");
130     encryptfmt = qdict_get_try_str(encryptopts, "format");
131 
132     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
133                                false, errp);
134     if (!bs->file) {
135         ret = -EINVAL;
136         goto fail;
137     }
138 
139     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
140     if (ret < 0) {
141         goto fail;
142     }
143     be32_to_cpus(&header.magic);
144     be32_to_cpus(&header.version);
145     be64_to_cpus(&header.backing_file_offset);
146     be32_to_cpus(&header.backing_file_size);
147     be32_to_cpus(&header.mtime);
148     be64_to_cpus(&header.size);
149     be32_to_cpus(&header.crypt_method);
150     be64_to_cpus(&header.l1_table_offset);
151 
152     if (header.magic != QCOW_MAGIC) {
153         error_setg(errp, "Image not in qcow format");
154         ret = -EINVAL;
155         goto fail;
156     }
157     if (header.version != QCOW_VERSION) {
158         error_setg(errp, "Unsupported qcow version %" PRIu32, header.version);
159         ret = -ENOTSUP;
160         goto fail;
161     }
162 
163     if (header.size <= 1) {
164         error_setg(errp, "Image size is too small (must be at least 2 bytes)");
165         ret = -EINVAL;
166         goto fail;
167     }
168     if (header.cluster_bits < 9 || header.cluster_bits > 16) {
169         error_setg(errp, "Cluster size must be between 512 and 64k");
170         ret = -EINVAL;
171         goto fail;
172     }
173 
174     /* l2_bits is the log2 of the number of L2 entries; each entry stores a
175      * uint64_t, so table bytes = (1 << l2_bits) << 3. */
176     if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) {
177         error_setg(errp, "L2 table size must be between 512 and 64k");
178         ret = -EINVAL;
179         goto fail;
180     }
181 
182     s->crypt_method_header = header.crypt_method;
183     if (s->crypt_method_header) {
184         if (bdrv_uses_whitelist() &&
185             s->crypt_method_header == QCOW_CRYPT_AES) {
186             error_setg(errp,
187                        "Use of AES-CBC encrypted qcow images is no longer "
188                        "supported in system emulators");
189             error_append_hint(errp,
190                               "You can use 'qemu-img convert' to convert your "
191                               "image to an alternative supported format, such "
192                               "as unencrypted qcow, or raw with the LUKS "
193                               "format instead.\n");
194             ret = -ENOSYS;
195             goto fail;
196         }
197         if (s->crypt_method_header == QCOW_CRYPT_AES) {
198             if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
199                 error_setg(errp,
200                            "Header reported 'aes' encryption format but "
201                            "options specify '%s'", encryptfmt);
202                 ret = -EINVAL;
203                 goto fail;
204             }
205             qdict_del(encryptopts, "format");
206             crypto_opts = block_crypto_open_opts_init(
207                 Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp);
208             if (!crypto_opts) {
209                 ret = -EINVAL;
210                 goto fail;
211             }
212 
213             if (flags & BDRV_O_NO_IO) {
214                 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
215             }
216             s->crypto = qcrypto_block_open(crypto_opts, "encrypt.",
217                                            NULL, NULL, cflags, errp);
218             if (!s->crypto) {
219                 ret = -EINVAL;
220                 goto fail;
221             }
222         } else {
223             error_setg(errp, "invalid encryption method in qcow header");
224             ret = -EINVAL;
225             goto fail;
226         }
227         bs->encrypted = true;
228     } else {
229         if (encryptfmt) {
230             error_setg(errp, "No encryption in image header, but options "
231                        "specified format '%s'", encryptfmt);
232             ret = -EINVAL;
233             goto fail;
234         }
235     }
236     s->cluster_bits = header.cluster_bits;
237     s->cluster_size = 1 << s->cluster_bits;
238     s->cluster_sectors = 1 << (s->cluster_bits - 9);
239     s->l2_bits = header.l2_bits;
240     s->l2_size = 1 << s->l2_bits;
241     bs->total_sectors = header.size / 512;
242     s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
243 
244     /* read the level 1 table */
245     shift = s->cluster_bits + s->l2_bits;
246     if (header.size > UINT64_MAX - (1LL << shift)) {
247         error_setg(errp, "Image too large");
248         ret = -EINVAL;
249         goto fail;
250     } else {
251         uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift;
252         if (l1_size > INT_MAX / sizeof(uint64_t)) {
253             error_setg(errp, "Image too large");
254             ret = -EINVAL;
255             goto fail;
256         }
257         s->l1_size = l1_size;
258     }
259 
260     s->l1_table_offset = header.l1_table_offset;
261     s->l1_table = g_try_new(uint64_t, s->l1_size);
262     if (s->l1_table == NULL) {
263         error_setg(errp, "Could not allocate memory for L1 table");
264         ret = -ENOMEM;
265         goto fail;
266     }
267 
268     ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
269                s->l1_size * sizeof(uint64_t));
270     if (ret < 0) {
271         goto fail;
272     }
273 
274     for (i = 0; i < s->l1_size; i++) {
275         be64_to_cpus(&s->l1_table[i]);
276     }
277 
278     /* alloc L2 cache (max. 8k entries * 16 tables * 8 bytes = 1 MiB) */
279     s->l2_cache =
280         qemu_try_blockalign(bs->file->bs,
281                             s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
282     if (s->l2_cache == NULL) {
283         error_setg(errp, "Could not allocate L2 table cache");
284         ret = -ENOMEM;
285         goto fail;
286     }
287     s->cluster_cache = g_malloc(s->cluster_size);
288     s->cluster_data = g_malloc(s->cluster_size);
289     s->cluster_cache_offset = -1;
290 
291     /* read the backing file name */
292     if (header.backing_file_offset != 0) {
293         len = header.backing_file_size;
294         if (len > 1023 || len >= sizeof(bs->backing_file)) {
295             error_setg(errp, "Backing file name too long");
296             ret = -EINVAL;
297             goto fail;
298         }
299         ret = bdrv_pread(bs->file, header.backing_file_offset,
300                    bs->backing_file, len);
301         if (ret < 0) {
302             goto fail;
303         }
304         bs->backing_file[len] = '\0';
305     }
306 
307     /* Disable migration when qcow images are used */
308     error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
309                "does not support live migration",
310                bdrv_get_device_or_node_name(bs));
311     ret = migrate_add_blocker(s->migration_blocker, &local_err);
312     if (local_err) {
313         error_propagate(errp, local_err);
314         error_free(s->migration_blocker);
315         goto fail;
316     }
317 
318     qobject_unref(encryptopts);
319     qapi_free_QCryptoBlockOpenOptions(crypto_opts);
320     qemu_co_mutex_init(&s->lock);
321     return 0;
322 
323  fail:
324     g_free(s->l1_table);
325     qemu_vfree(s->l2_cache);
326     g_free(s->cluster_cache);
327     g_free(s->cluster_data);
328     qcrypto_block_free(s->crypto);
329     qobject_unref(encryptopts);
330     qapi_free_QCryptoBlockOpenOptions(crypto_opts);
331     return ret;
332 }
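
/*
 * Editor's note (illustrative sketch, assuming a header already converted
 * to host byte order): each L1 entry covers one L2 table, i.e.
 * 1 << (l2_bits + cluster_bits) guest bytes.  With the defaults written by
 * qcow_co_create() below (cluster_bits = 12, l2_bits = 9) that is 2 MiB per
 * L2 table, which is where the l1_size computation above comes from.
 */
static inline uint64_t qcow1_l1_entries(const QCowHeader *h)
{
    uint64_t bytes_per_l2 = UINT64_C(1) << (h->cluster_bits + h->l2_bits);

    /* same rounding as qcow_open(): ceil(size / bytes_per_l2) */
    return DIV_ROUND_UP(h->size, bytes_per_l2);
}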
333 
334 
335 /* We have nothing to do for QCOW reopen; the stub just returns
336  * success. */
337 static int qcow_reopen_prepare(BDRVReopenState *state,
338                                BlockReopenQueue *queue, Error **errp)
339 {
340     return 0;
341 }
342 
343 
344 /* 'allocate' is:
345  *
346  * 0 to not allocate.
347  *
348  * 1 to allocate a normal cluster (for sector indexes 'n_start' to
349  * 'n_end')
350  *
351  * 2 to allocate a compressed cluster of size
352  * 'compressed_size'. 'compressed_size' must be > 0 and <
353  * cluster_size
354  *
355  * Returns 0 if the cluster is not allocated, 1 if *result has been set,
356  * and a negative errno on failure.
357  */
358 static int get_cluster_offset(BlockDriverState *bs,
359                               uint64_t offset, int allocate,
360                               int compressed_size,
361                               int n_start, int n_end, uint64_t *result)
362 {
363     BDRVQcowState *s = bs->opaque;
364     int min_index, i, j, l1_index, l2_index, ret;
365     int64_t l2_offset;
366     uint64_t *l2_table, cluster_offset, tmp;
367     uint32_t min_count;
368     int new_l2_table;
369 
370     *result = 0;
371     l1_index = offset >> (s->l2_bits + s->cluster_bits);
372     l2_offset = s->l1_table[l1_index];
373     new_l2_table = 0;
374     if (!l2_offset) {
375         if (!allocate)
376             return 0;
377         /* allocate a new l2 entry */
378         l2_offset = bdrv_getlength(bs->file->bs);
379         if (l2_offset < 0) {
380             return l2_offset;
381         }
382         /* round to cluster size */
383         l2_offset = QEMU_ALIGN_UP(l2_offset, s->cluster_size);
384         /* update the L1 entry */
385         s->l1_table[l1_index] = l2_offset;
386         tmp = cpu_to_be64(l2_offset);
387         BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
388         ret = bdrv_pwrite_sync(bs->file,
389                                s->l1_table_offset + l1_index * sizeof(tmp),
390                                &tmp, sizeof(tmp));
391         if (ret < 0) {
392             return ret;
393         }
394         new_l2_table = 1;
395     }
396     for (i = 0; i < L2_CACHE_SIZE; i++) {
397         if (l2_offset == s->l2_cache_offsets[i]) {
398             /* increment the hit count */
399             if (++s->l2_cache_counts[i] == 0xffffffff) {
400                 for (j = 0; j < L2_CACHE_SIZE; j++) {
401                     s->l2_cache_counts[j] >>= 1;
402                 }
403             }
404             l2_table = s->l2_cache + (i << s->l2_bits);
405             goto found;
406         }
407     }
408     /* not found: load a new table into the least used cache entry */
409     min_index = 0;
410     min_count = 0xffffffff;
411     for (i = 0; i < L2_CACHE_SIZE; i++) {
412         if (s->l2_cache_counts[i] < min_count) {
413             min_count = s->l2_cache_counts[i];
414             min_index = i;
415         }
416     }
417     l2_table = s->l2_cache + (min_index << s->l2_bits);
418     BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
419     if (new_l2_table) {
420         memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
421         ret = bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
422                                s->l2_size * sizeof(uint64_t));
423         if (ret < 0) {
424             return ret;
425         }
426     } else {
427         ret = bdrv_pread(bs->file, l2_offset, l2_table,
428                          s->l2_size * sizeof(uint64_t));
429         if (ret < 0) {
430             return ret;
431         }
432     }
433     s->l2_cache_offsets[min_index] = l2_offset;
434     s->l2_cache_counts[min_index] = 1;
435  found:
436     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
437     cluster_offset = be64_to_cpu(l2_table[l2_index]);
438     if (!cluster_offset ||
439         ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
440         if (!allocate)
441             return 0;
442         BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
443         /* allocate a new cluster */
444         if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
445             (n_end - n_start) < s->cluster_sectors) {
446             /* if the cluster is already compressed, we must
447                decompress it first in case it is not completely
448                overwritten */
449             if (decompress_cluster(bs, cluster_offset) < 0) {
450                 return -EIO;
451             }
452             cluster_offset = bdrv_getlength(bs->file->bs);
453             if ((int64_t) cluster_offset < 0) {
454                 return cluster_offset;
455             }
456             cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
457             /* write the cluster content */
458             BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
459             ret = bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache,
460                               s->cluster_size);
461             if (ret < 0) {
462                 return ret;
463             }
464         } else {
465             cluster_offset = bdrv_getlength(bs->file->bs);
466             if ((int64_t) cluster_offset < 0) {
467                 return cluster_offset;
468             }
469             if (allocate == 1) {
470                 /* round to cluster size */
471                 cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
472                 if (cluster_offset + s->cluster_size > INT64_MAX) {
473                     return -E2BIG;
474                 }
475                 ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size,
476                                     PREALLOC_MODE_OFF, NULL);
477                 if (ret < 0) {
478                     return ret;
479                 }
480                 /* if encrypted, we must initialize the parts of the
481                    cluster that won't be overwritten by the guest */
482                 if (bs->encrypted &&
483                     (n_end - n_start) < s->cluster_sectors) {
484                     uint64_t start_sect;
485                     assert(s->crypto);
486                     start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
487                     for (i = 0; i < s->cluster_sectors; i++) {
488                         if (i < n_start || i >= n_end) {
489                             memset(s->cluster_data, 0x00, 512);
490                             if (qcrypto_block_encrypt(s->crypto,
491                                                       (start_sect + i) *
492                                                       BDRV_SECTOR_SIZE,
493                                                       s->cluster_data,
494                                                       BDRV_SECTOR_SIZE,
495                                                       NULL) < 0) {
496                                 return -EIO;
497                             }
498                             BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
499                             ret = bdrv_pwrite(bs->file,
500                                               cluster_offset + i * 512,
501                                               s->cluster_data, 512);
502                             if (ret < 0) {
503                                 return ret;
504                             }
505                         }
506                     }
507                 }
508             } else if (allocate == 2) {
509                 cluster_offset |= QCOW_OFLAG_COMPRESSED |
510                     (uint64_t)compressed_size << (63 - s->cluster_bits);
511             }
512         }
513         /* update L2 table */
514         tmp = cpu_to_be64(cluster_offset);
515         l2_table[l2_index] = tmp;
516         if (allocate == 2) {
517             BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
518         } else {
519             BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
520         }
521         ret = bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
522                                &tmp, sizeof(tmp));
523         if (ret < 0) {
524             return ret;
525         }
526     }
527     *result = cluster_offset;
528     return 1;
529 }
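
/*
 * Editor's note (illustrative, hypothetical helper): the three 'allocate'
 * modes above map onto the callers in this file -- 0 for lookups on the
 * read and block-status paths, 1 for ordinary writes, 2 for compressed
 * writes.  A pure lookup, taken with s->lock held, reduces to:
 */
static inline int qcow1_lookup_cluster(BlockDriverState *bs, uint64_t offset,
                                       uint64_t *cluster_offset)
{
    /* allocate=0 never modifies the image; it only walks the L1/L2 tables */
    return get_cluster_offset(bs, offset, 0, 0, 0, 0, cluster_offset);
}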
530 
531 static int coroutine_fn qcow_co_block_status(BlockDriverState *bs,
532                                              bool want_zero,
533                                              int64_t offset, int64_t bytes,
534                                              int64_t *pnum, int64_t *map,
535                                              BlockDriverState **file)
536 {
537     BDRVQcowState *s = bs->opaque;
538     int index_in_cluster, ret;
539     int64_t n;
540     uint64_t cluster_offset;
541 
542     qemu_co_mutex_lock(&s->lock);
543     ret = get_cluster_offset(bs, offset, 0, 0, 0, 0, &cluster_offset);
544     qemu_co_mutex_unlock(&s->lock);
545     if (ret < 0) {
546         return ret;
547     }
548     index_in_cluster = offset & (s->cluster_size - 1);
549     n = s->cluster_size - index_in_cluster;
550     if (n > bytes) {
551         n = bytes;
552     }
553     *pnum = n;
554     if (!cluster_offset) {
555         return 0;
556     }
557     if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypto) {
558         return BDRV_BLOCK_DATA;
559     }
560     *map = cluster_offset | index_in_cluster;
561     *file = bs->file->bs;
562     return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
563 }
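
/*
 * Editor's note (illustrative sketch): 'qemu-img map' ends up in the
 * callback above; compressed and encrypted clusters are reported as DATA
 * without a host offset because the bytes in the image file do not match
 * the guest-visible content.  The classification mirrors this check:
 */
static inline bool qcow1_entry_has_host_offset(const BDRVQcowState *s,
                                               uint64_t cluster_offset)
{
    return cluster_offset &&
           !(cluster_offset & QCOW_OFLAG_COMPRESSED) &&
           !s->crypto;
}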
564 
565 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
566                              const uint8_t *buf, int buf_size)
567 {
568     z_stream strm1, *strm = &strm1;
569     int ret, out_len;
570 
571     memset(strm, 0, sizeof(*strm));
572 
573     strm->next_in = (uint8_t *)buf;
574     strm->avail_in = buf_size;
575     strm->next_out = out_buf;
576     strm->avail_out = out_buf_size;
577 
578     ret = inflateInit2(strm, -12);
579     if (ret != Z_OK)
580         return -1;
581     ret = inflate(strm, Z_FINISH);
582     out_len = strm->next_out - out_buf;
583     if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
584         out_len != out_buf_size) {
585         inflateEnd(strm);
586         return -1;
587     }
588     inflateEnd(strm);
589     return 0;
590 }
591 
592 static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
593 {
594     BDRVQcowState *s = bs->opaque;
595     int ret, csize;
596     uint64_t coffset;
597 
598     coffset = cluster_offset & s->cluster_offset_mask;
599     if (s->cluster_cache_offset != coffset) {
600         csize = cluster_offset >> (63 - s->cluster_bits);
601         csize &= (s->cluster_size - 1);
602         BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
603         ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
604         if (ret != csize)
605             return -1;
606         if (decompress_buffer(s->cluster_cache, s->cluster_size,
607                               s->cluster_data, csize) < 0) {
608             return -1;
609         }
610         s->cluster_cache_offset = coffset;
611     }
612     return 0;
613 }
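
/*
 * Editor's note (illustrative sketch): a compressed L2 entry packs three
 * fields, mirroring get_cluster_offset() and decompress_cluster() above:
 * bit 63 is QCOW_OFLAG_COMPRESSED, the compressed byte count (always less
 * than cluster_size) sits in the cluster_bits bits below it, and the low
 * bits hold the offset of the compressed data in the image file.
 */
static inline uint64_t qcow1_compressed_entry(const BDRVQcowState *s,
                                              uint64_t file_offset,
                                              uint64_t compressed_size)
{
    return QCOW_OFLAG_COMPRESSED |
           (compressed_size << (63 - s->cluster_bits)) |
           (file_offset & s->cluster_offset_mask);
}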
614 
615 static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
616                          int nb_sectors, QEMUIOVector *qiov)
617 {
618     BDRVQcowState *s = bs->opaque;
619     int index_in_cluster;
620     int ret = 0, n;
621     uint64_t cluster_offset;
622     struct iovec hd_iov;
623     QEMUIOVector hd_qiov;
624     uint8_t *buf;
625     void *orig_buf;
626 
627     if (qiov->niov > 1) {
628         buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
629         if (buf == NULL) {
630             return -ENOMEM;
631         }
632     } else {
633         orig_buf = NULL;
634         buf = (uint8_t *)qiov->iov->iov_base;
635     }
636 
637     qemu_co_mutex_lock(&s->lock);
638 
639     while (nb_sectors != 0) {
640         /* prepare next request */
641         ret = get_cluster_offset(bs, sector_num << 9,
642                                  0, 0, 0, 0, &cluster_offset);
643         if (ret < 0) {
644             break;
645         }
646         index_in_cluster = sector_num & (s->cluster_sectors - 1);
647         n = s->cluster_sectors - index_in_cluster;
648         if (n > nb_sectors) {
649             n = nb_sectors;
650         }
651 
652         if (!cluster_offset) {
653             if (bs->backing) {
654                 /* read from the base image */
655                 hd_iov.iov_base = (void *)buf;
656                 hd_iov.iov_len = n * 512;
657                 qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
658                 qemu_co_mutex_unlock(&s->lock);
659                 /* qcow2 emits this on bs->file instead of bs->backing */
660                 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
661                 ret = bdrv_co_readv(bs->backing, sector_num, n, &hd_qiov);
662                 qemu_co_mutex_lock(&s->lock);
663                 if (ret < 0) {
664                     break;
665                 }
666             } else {
667                 /* Note: in this case, no need to wait */
668                 memset(buf, 0, 512 * n);
669             }
670         } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
671             /* TODO: add AIO support for compressed blocks? */
672             if (decompress_cluster(bs, cluster_offset) < 0) {
673                 ret = -EIO;
674                 break;
675             }
676             memcpy(buf,
677                    s->cluster_cache + index_in_cluster * 512, 512 * n);
678         } else {
679             if ((cluster_offset & 511) != 0) {
680                 ret = -EIO;
681                 break;
682             }
683             hd_iov.iov_base = (void *)buf;
684             hd_iov.iov_len = n * 512;
685             qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
686             qemu_co_mutex_unlock(&s->lock);
687             BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
688             ret = bdrv_co_readv(bs->file,
689                                 (cluster_offset >> 9) + index_in_cluster,
690                                 n, &hd_qiov);
691             qemu_co_mutex_lock(&s->lock);
692             if (ret < 0) {
693                 break;
694             }
695             if (bs->encrypted) {
696                 assert(s->crypto);
697                 if (qcrypto_block_decrypt(s->crypto,
698                                           sector_num * BDRV_SECTOR_SIZE, buf,
699                                           n * BDRV_SECTOR_SIZE, NULL) < 0) {
700                     ret = -EIO;
701                     break;
702                 }
703             }
704         }
705         ret = 0;
706 
707         nb_sectors -= n;
708         sector_num += n;
709         buf += n * 512;
710     }
711 
712     qemu_co_mutex_unlock(&s->lock);
713 
714     if (qiov->niov > 1) {
715         qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
716         qemu_vfree(orig_buf);
717     }
718 
719     return ret;
720 }
721 
722 static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
723                                        int nb_sectors, QEMUIOVector *qiov,
724                                        int flags)
725 {
726     BDRVQcowState *s = bs->opaque;
727     int index_in_cluster;
728     uint64_t cluster_offset;
729     int ret = 0, n;
730     struct iovec hd_iov;
731     QEMUIOVector hd_qiov;
732     uint8_t *buf;
733     void *orig_buf;
734 
735     assert(!flags);
736     s->cluster_cache_offset = -1; /* disable compressed cache */
737 
738     /* We must always copy the iov when encrypting, so we
739      * don't modify the original data buffer during encryption */
740     if (bs->encrypted || qiov->niov > 1) {
741         buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
742         if (buf == NULL) {
743             return -ENOMEM;
744         }
745         qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
746     } else {
747         orig_buf = NULL;
748         buf = (uint8_t *)qiov->iov->iov_base;
749     }
750 
751     qemu_co_mutex_lock(&s->lock);
752 
753     while (nb_sectors != 0) {
754 
755         index_in_cluster = sector_num & (s->cluster_sectors - 1);
756         n = s->cluster_sectors - index_in_cluster;
757         if (n > nb_sectors) {
758             n = nb_sectors;
759         }
760         ret = get_cluster_offset(bs, sector_num << 9, 1, 0,
761                                  index_in_cluster,
762                                  index_in_cluster + n, &cluster_offset);
763         if (ret < 0) {
764             break;
765         }
766         if (!cluster_offset || (cluster_offset & 511) != 0) {
767             ret = -EIO;
768             break;
769         }
770         if (bs->encrypted) {
771             assert(s->crypto);
772             if (qcrypto_block_encrypt(s->crypto, sector_num * BDRV_SECTOR_SIZE,
773                                       buf, n * BDRV_SECTOR_SIZE, NULL) < 0) {
774                 ret = -EIO;
775                 break;
776             }
777         }
778 
779         hd_iov.iov_base = (void *)buf;
780         hd_iov.iov_len = n * 512;
781         qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
782         qemu_co_mutex_unlock(&s->lock);
783         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
784         ret = bdrv_co_writev(bs->file,
785                              (cluster_offset >> 9) + index_in_cluster,
786                              n, &hd_qiov);
787         qemu_co_mutex_lock(&s->lock);
788         if (ret < 0) {
789             break;
790         }
791         ret = 0;
792 
793         nb_sectors -= n;
794         sector_num += n;
795         buf += n * 512;
796     }
797     qemu_co_mutex_unlock(&s->lock);
798 
799     qemu_vfree(orig_buf);
800 
801     return ret;
802 }
803 
804 static void qcow_close(BlockDriverState *bs)
805 {
806     BDRVQcowState *s = bs->opaque;
807 
808     qcrypto_block_free(s->crypto);
809     s->crypto = NULL;
810     g_free(s->l1_table);
811     qemu_vfree(s->l2_cache);
812     g_free(s->cluster_cache);
813     g_free(s->cluster_data);
814 
815     migrate_del_blocker(s->migration_blocker);
816     error_free(s->migration_blocker);
817 }
818 
819 static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
820                                        Error **errp)
821 {
822     BlockdevCreateOptionsQcow *qcow_opts;
823     int header_size, backing_filename_len, l1_size, shift, i;
824     QCowHeader header;
825     uint8_t *tmp;
826     int64_t total_size = 0;
827     int ret;
828     BlockDriverState *bs;
829     BlockBackend *qcow_blk;
830     QCryptoBlock *crypto = NULL;
831 
832     assert(opts->driver == BLOCKDEV_DRIVER_QCOW);
833     qcow_opts = &opts->u.qcow;
834 
835     /* Sanity checks */
836     total_size = qcow_opts->size;
837     if (total_size == 0) {
838         error_setg(errp, "Image size is too small, cannot be zero length");
839         return -EINVAL;
840     }
841 
842     if (qcow_opts->has_encrypt &&
843         qcow_opts->encrypt->format != Q_CRYPTO_BLOCK_FORMAT_QCOW)
844     {
845         error_setg(errp, "Unsupported encryption format");
846         return -EINVAL;
847     }
848 
849     /* Create BlockBackend to write to the image */
850     bs = bdrv_open_blockdev_ref(qcow_opts->file, errp);
851     if (bs == NULL) {
852         return -EIO;
853     }
854 
855     qcow_blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
856     ret = blk_insert_bs(qcow_blk, bs, errp);
857     if (ret < 0) {
858         goto exit;
859     }
860     blk_set_allow_write_beyond_eof(qcow_blk, true);
861 
862     /* Create image format */
863     ret = blk_truncate(qcow_blk, 0, PREALLOC_MODE_OFF, errp);
864     if (ret < 0) {
865         goto exit;
866     }
867 
868     memset(&header, 0, sizeof(header));
869     header.magic = cpu_to_be32(QCOW_MAGIC);
870     header.version = cpu_to_be32(QCOW_VERSION);
871     header.size = cpu_to_be64(total_size);
872     header_size = sizeof(header);
873     backing_filename_len = 0;
874     if (qcow_opts->has_backing_file) {
875         if (strcmp(qcow_opts->backing_file, "fat:")) {
876             header.backing_file_offset = cpu_to_be64(header_size);
877             backing_filename_len = strlen(qcow_opts->backing_file);
878             header.backing_file_size = cpu_to_be32(backing_filename_len);
879             header_size += backing_filename_len;
880         } else {
881             /* special backing file for vvfat */
882             qcow_opts->has_backing_file = false;
883         }
884         header.cluster_bits = 9; /* 512-byte clusters to avoid copying
885                                     unmodified sectors */
886         header.l2_bits = 12; /* 32 KB L2 tables */
887     } else {
888         header.cluster_bits = 12; /* 4 KB clusters */
889         header.l2_bits = 9; /* 4 KB L2 tables */
890     }
891     header_size = (header_size + 7) & ~7;
892     shift = header.cluster_bits + header.l2_bits;
893     l1_size = (total_size + (1LL << shift) - 1) >> shift;
894 
895     header.l1_table_offset = cpu_to_be64(header_size);
896 
897     if (qcow_opts->has_encrypt) {
898         header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
899 
900         crypto = qcrypto_block_create(qcow_opts->encrypt, "encrypt.",
901                                       NULL, NULL, NULL, errp);
902         if (!crypto) {
903             ret = -EINVAL;
904             goto exit;
905         }
906     } else {
907         header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
908     }
909 
910     /* write all the data */
911     ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header), 0);
912     if (ret != sizeof(header)) {
913         goto exit;
914     }
915 
916     if (qcow_opts->has_backing_file) {
917         ret = blk_pwrite(qcow_blk, sizeof(header),
918                          qcow_opts->backing_file, backing_filename_len, 0);
919         if (ret != backing_filename_len) {
920             goto exit;
921         }
922     }
923 
924     tmp = g_malloc0(BDRV_SECTOR_SIZE);
925     for (i = 0; i < DIV_ROUND_UP(sizeof(uint64_t) * l1_size, BDRV_SECTOR_SIZE);
926          i++) {
927         ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i,
928                          tmp, BDRV_SECTOR_SIZE, 0);
929         if (ret != BDRV_SECTOR_SIZE) {
930             g_free(tmp);
931             goto exit;
932         }
933     }
934 
935     g_free(tmp);
936     ret = 0;
937 exit:
938     blk_unref(qcow_blk);
939     qcrypto_block_free(crypto);
940     return ret;
941 }
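
/*
 * Editor's note (usage, illustrative): this create path is normally
 * reached through qemu-img, e.g. something like
 *
 *     qemu-img create -f qcow -o backing_file=base.img overlay.qcow 1G
 *
 * with encryption requested via the 'encrypt.format=aes' option plus an
 * 'encrypt.key-secret' that references a --object secret definition.
 */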
942 
943 static int coroutine_fn qcow_co_create_opts(const char *filename,
944                                             QemuOpts *opts, Error **errp)
945 {
946     BlockdevCreateOptions *create_options = NULL;
947     BlockDriverState *bs = NULL;
948     QDict *qdict = NULL;
949     QObject *qobj;
950     Visitor *v;
951     const char *val;
952     Error *local_err = NULL;
953     int ret;
954 
955     static const QDictRenames opt_renames[] = {
956         { BLOCK_OPT_BACKING_FILE,       "backing-file" },
957         { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
958         { NULL, NULL },
959     };
960 
961     /* Parse options and convert legacy syntax */
962     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qcow_create_opts, true);
963 
964     val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
965     if (val && !strcmp(val, "on")) {
966         qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
967     } else if (val && !strcmp(val, "off")) {
968         qdict_del(qdict, BLOCK_OPT_ENCRYPT);
969     }
970 
971     val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
972     if (val && !strcmp(val, "aes")) {
973         qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
974     }
975 
976     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
977         ret = -EINVAL;
978         goto fail;
979     }
980 
981     /* Create and open the file (protocol layer) */
982     ret = bdrv_create_file(filename, opts, &local_err);
983     if (ret < 0) {
984         error_propagate(errp, local_err);
985         goto fail;
986     }
987 
988     bs = bdrv_open(filename, NULL, NULL,
989                    BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
990     if (bs == NULL) {
991         ret = -EIO;
992         goto fail;
993     }
994 
995     /* Now get the QAPI type BlockdevCreateOptions */
996     qdict_put_str(qdict, "driver", "qcow");
997     qdict_put_str(qdict, "file", bs->node_name);
998 
999     qobj = qdict_crumple(qdict, errp);
1000     qobject_unref(qdict);
1001     qdict = qobject_to(QDict, qobj);
1002     if (qdict == NULL) {
1003         ret = -EINVAL;
1004         goto fail;
1005     }
1006 
1007     v = qobject_input_visitor_new_keyval(QOBJECT(qdict));
1008     visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
1009     visit_free(v);
1010 
1011     if (local_err) {
1012         error_propagate(errp, local_err);
1013         ret = -EINVAL;
1014         goto fail;
1015     }
1016 
1017     /* Silently round up size */
1018     assert(create_options->driver == BLOCKDEV_DRIVER_QCOW);
1019     create_options->u.qcow.size =
1020         ROUND_UP(create_options->u.qcow.size, BDRV_SECTOR_SIZE);
1021 
1022     /* Create the qcow image (format layer) */
1023     ret = qcow_co_create(create_options, errp);
1024     if (ret < 0) {
1025         goto fail;
1026     }
1027 
1028     ret = 0;
1029 fail:
1030     qobject_unref(qdict);
1031     bdrv_unref(bs);
1032     qapi_free_BlockdevCreateOptions(create_options);
1033     return ret;
1034 }
1035 
1036 static int qcow_make_empty(BlockDriverState *bs)
1037 {
1038     BDRVQcowState *s = bs->opaque;
1039     uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1040     int ret;
1041 
1042     memset(s->l1_table, 0, l1_length);
1043     if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
1044             l1_length) < 0)
1045         return -1;
1046     ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length,
1047                         PREALLOC_MODE_OFF, NULL);
1048     if (ret < 0)
1049         return ret;
1050 
1051     memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
1052     memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
1053     memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
1054 
1055     return 0;
1056 }
1057 
1058 /* XXX: put compressed sectors first, then all the cluster-aligned
1059    tables, to avoid wasting bytes on alignment */
1060 static coroutine_fn int
1061 qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1062                            uint64_t bytes, QEMUIOVector *qiov)
1063 {
1064     BDRVQcowState *s = bs->opaque;
1065     QEMUIOVector hd_qiov;
1066     struct iovec iov;
1067     z_stream strm;
1068     int ret, out_len;
1069     uint8_t *buf, *out_buf;
1070     uint64_t cluster_offset;
1071 
1072     buf = qemu_blockalign(bs, s->cluster_size);
1073     if (bytes != s->cluster_size) {
1074         if (bytes > s->cluster_size ||
1075             offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
1076         {
1077             qemu_vfree(buf);
1078             return -EINVAL;
1079         }
1080         /* Zero-pad last write if image size is not cluster aligned */
1081         memset(buf + bytes, 0, s->cluster_size - bytes);
1082     }
1083     qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
1084 
1085     out_buf = g_malloc(s->cluster_size);
1086 
1087     /* default compression level, small window, no zlib header */
1088     memset(&strm, 0, sizeof(strm));
1089     ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1090                        Z_DEFLATED, -12,
1091                        9, Z_DEFAULT_STRATEGY);
1092     if (ret != 0) {
1093         ret = -EINVAL;
1094         goto fail;
1095     }
1096 
1097     strm.avail_in = s->cluster_size;
1098     strm.next_in = (uint8_t *)buf;
1099     strm.avail_out = s->cluster_size;
1100     strm.next_out = out_buf;
1101 
1102     ret = deflate(&strm, Z_FINISH);
1103     if (ret != Z_STREAM_END && ret != Z_OK) {
1104         deflateEnd(&strm);
1105         ret = -EINVAL;
1106         goto fail;
1107     }
1108     out_len = strm.next_out - out_buf;
1109 
1110     deflateEnd(&strm);
1111 
1112     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1113         /* could not compress: write normal cluster */
1114         ret = qcow_co_writev(bs, offset >> BDRV_SECTOR_BITS,
1115                              bytes >> BDRV_SECTOR_BITS, qiov, 0);
1116         if (ret < 0) {
1117             goto fail;
1118         }
1119         goto success;
1120     }
1121     qemu_co_mutex_lock(&s->lock);
1122     ret = get_cluster_offset(bs, offset, 2, out_len, 0, 0, &cluster_offset);
1123     qemu_co_mutex_unlock(&s->lock);
1124     if (ret < 0) {
1125         goto fail;
1126     }
1127     if (cluster_offset == 0) {
1128         ret = -EIO;
1129         goto fail;
1130     }
1131     cluster_offset &= s->cluster_offset_mask;
1132 
1133     iov = (struct iovec) {
1134         .iov_base   = out_buf,
1135         .iov_len    = out_len,
1136     };
1137     qemu_iovec_init_external(&hd_qiov, &iov, 1);
1138     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
1139     ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
1140     if (ret < 0) {
1141         goto fail;
1142     }
1143 success:
1144     ret = 0;
1145 fail:
1146     qemu_vfree(buf);
1147     g_free(out_buf);
1148     return ret;
1149 }
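
/*
 * Editor's note (usage, illustrative): compressed clusters are produced
 * through the path above, typically via 'qemu-img convert -c -O qcow ...';
 * when a cluster does not shrink under deflate, the code silently falls
 * back to the regular write path.
 */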
1150 
1151 static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1152 {
1153     BDRVQcowState *s = bs->opaque;
1154     bdi->cluster_size = s->cluster_size;
1155     return 0;
1156 }
1157 
1158 static QemuOptsList qcow_create_opts = {
1159     .name = "qcow-create-opts",
1160     .head = QTAILQ_HEAD_INITIALIZER(qcow_create_opts.head),
1161     .desc = {
1162         {
1163             .name = BLOCK_OPT_SIZE,
1164             .type = QEMU_OPT_SIZE,
1165             .help = "Virtual disk size"
1166         },
1167         {
1168             .name = BLOCK_OPT_BACKING_FILE,
1169             .type = QEMU_OPT_STRING,
1170             .help = "File name of a base image"
1171         },
1172         {
1173             .name = BLOCK_OPT_ENCRYPT,
1174             .type = QEMU_OPT_BOOL,
1175             .help = "Encrypt the image with format 'aes'. (Deprecated "
1176                     "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
1177         },
1178         {
1179             .name = BLOCK_OPT_ENCRYPT_FORMAT,
1180             .type = QEMU_OPT_STRING,
1181             .help = "Encrypt the image, format choices: 'aes'",
1182         },
1183         BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET("encrypt."),
1184         { /* end of list */ }
1185     }
1186 };
1187 
1188 static BlockDriver bdrv_qcow = {
1189     .format_name	= "qcow",
1190     .instance_size	= sizeof(BDRVQcowState),
1191     .bdrv_probe		= qcow_probe,
1192     .bdrv_open		= qcow_open,
1193     .bdrv_close		= qcow_close,
1194     .bdrv_child_perm        = bdrv_format_default_perms,
1195     .bdrv_reopen_prepare    = qcow_reopen_prepare,
1196     .bdrv_co_create         = qcow_co_create,
1197     .bdrv_co_create_opts    = qcow_co_create_opts,
1198     .bdrv_has_zero_init     = bdrv_has_zero_init_1,
1199     .supports_backing       = true,
1200 
1201     .bdrv_co_readv          = qcow_co_readv,
1202     .bdrv_co_writev         = qcow_co_writev,
1203     .bdrv_co_block_status   = qcow_co_block_status,
1204 
1205     .bdrv_make_empty        = qcow_make_empty,
1206     .bdrv_co_pwritev_compressed = qcow_co_pwritev_compressed,
1207     .bdrv_get_info          = qcow_get_info,
1208 
1209     .create_opts            = &qcow_create_opts,
1210 };
1211 
1212 static void bdrv_qcow_init(void)
1213 {
1214     bdrv_register(&bdrv_qcow);
1215 }
1216 
1217 block_init(bdrv_qcow_init);
1218