xref: /openbmc/qemu/block/qcow.c (revision 89854803)
1 /*
2  * Block driver for the QCOW format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu/error-report.h"
28 #include "block/block_int.h"
29 #include "sysemu/block-backend.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/bswap.h"
33 #include <zlib.h>
34 #include "qapi/qmp/qdict.h"
35 #include "qapi/qmp/qstring.h"
36 #include "qapi/qobject-input-visitor.h"
37 #include "qapi/qapi-visit-block-core.h"
38 #include "crypto/block.h"
39 #include "migration/blocker.h"
40 #include "block/crypto.h"
41 
/**************************************************************/
/* QEMU COW block driver with compression and encryption support */

/* Header magic: the bytes 'Q' 'F' 'I' 0xfb read as a big-endian 32-bit value */
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
#define QCOW_VERSION 1

/* Values of the crypt_method header field */
#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

/*
 * Top bit of an L2 entry: the entry describes a compressed cluster.
 * The constant is unsigned because shifting a signed 1 into bit 63 is
 * undefined behaviour; every user combines it with uint64_t values.
 */
#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
52 
53 typedef struct QCowHeader {
54     uint32_t magic;
55     uint32_t version;
56     uint64_t backing_file_offset;
57     uint32_t backing_file_size;
58     uint32_t mtime;
59     uint64_t size; /* in bytes */
60     uint8_t cluster_bits;
61     uint8_t l2_bits;
62     uint16_t padding;
63     uint32_t crypt_method;
64     uint64_t l1_table_offset;
65 } QEMU_PACKED QCowHeader;
66 
67 #define L2_CACHE_SIZE 16
68 
69 typedef struct BDRVQcowState {
70     int cluster_bits;
71     int cluster_size;
72     int cluster_sectors;
73     int l2_bits;
74     int l2_size;
75     unsigned int l1_size;
76     uint64_t cluster_offset_mask;
77     uint64_t l1_table_offset;
78     uint64_t *l1_table;
79     uint64_t *l2_cache;
80     uint64_t l2_cache_offsets[L2_CACHE_SIZE];
81     uint32_t l2_cache_counts[L2_CACHE_SIZE];
82     uint8_t *cluster_cache;
83     uint8_t *cluster_data;
84     uint64_t cluster_cache_offset;
85     QCryptoBlock *crypto; /* Disk encryption format driver */
86     uint32_t crypt_method_header;
87     CoMutex lock;
88     Error *migration_blocker;
89 } BDRVQcowState;
90 
91 static QemuOptsList qcow_create_opts;
92 
93 static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
94 
95 static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
96 {
97     const QCowHeader *cow_header = (const void *)buf;
98 
99     if (buf_size >= sizeof(QCowHeader) &&
100         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
101         be32_to_cpu(cow_header->version) == QCOW_VERSION)
102         return 100;
103     else
104         return 0;
105 }
106 
107 static QemuOptsList qcow_runtime_opts = {
108     .name = "qcow",
109     .head = QTAILQ_HEAD_INITIALIZER(qcow_runtime_opts.head),
110     .desc = {
111         BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET("encrypt."),
112         { /* end of list */ }
113     },
114 };
115 
116 static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
117                      Error **errp)
118 {
119     BDRVQcowState *s = bs->opaque;
120     unsigned int len, i, shift;
121     int ret;
122     QCowHeader header;
123     Error *local_err = NULL;
124     QCryptoBlockOpenOptions *crypto_opts = NULL;
125     unsigned int cflags = 0;
126     QDict *encryptopts = NULL;
127     const char *encryptfmt;
128 
129     qdict_extract_subqdict(options, &encryptopts, "encrypt.");
130     encryptfmt = qdict_get_try_str(encryptopts, "format");
131 
132     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
133                                false, errp);
134     if (!bs->file) {
135         ret = -EINVAL;
136         goto fail;
137     }
138 
139     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
140     if (ret < 0) {
141         goto fail;
142     }
143     be32_to_cpus(&header.magic);
144     be32_to_cpus(&header.version);
145     be64_to_cpus(&header.backing_file_offset);
146     be32_to_cpus(&header.backing_file_size);
147     be32_to_cpus(&header.mtime);
148     be64_to_cpus(&header.size);
149     be32_to_cpus(&header.crypt_method);
150     be64_to_cpus(&header.l1_table_offset);
151 
152     if (header.magic != QCOW_MAGIC) {
153         error_setg(errp, "Image not in qcow format");
154         ret = -EINVAL;
155         goto fail;
156     }
157     if (header.version != QCOW_VERSION) {
158         error_setg(errp, "Unsupported qcow version %" PRIu32, header.version);
159         ret = -ENOTSUP;
160         goto fail;
161     }
162 
163     if (header.size <= 1) {
164         error_setg(errp, "Image size is too small (must be at least 2 bytes)");
165         ret = -EINVAL;
166         goto fail;
167     }
168     if (header.cluster_bits < 9 || header.cluster_bits > 16) {
169         error_setg(errp, "Cluster size must be between 512 and 64k");
170         ret = -EINVAL;
171         goto fail;
172     }
173 
174     /* l2_bits specifies number of entries; storing a uint64_t in each entry,
175      * so bytes = num_entries << 3. */
176     if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) {
177         error_setg(errp, "L2 table size must be between 512 and 64k");
178         ret = -EINVAL;
179         goto fail;
180     }
181 
182     s->crypt_method_header = header.crypt_method;
183     if (s->crypt_method_header) {
184         if (bdrv_uses_whitelist() &&
185             s->crypt_method_header == QCOW_CRYPT_AES) {
186             error_setg(errp,
187                        "Use of AES-CBC encrypted qcow images is no longer "
188                        "supported in system emulators");
189             error_append_hint(errp,
190                               "You can use 'qemu-img convert' to convert your "
191                               "image to an alternative supported format, such "
192                               "as unencrypted qcow, or raw with the LUKS "
193                               "format instead.\n");
194             ret = -ENOSYS;
195             goto fail;
196         }
197         if (s->crypt_method_header == QCOW_CRYPT_AES) {
198             if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
199                 error_setg(errp,
200                            "Header reported 'aes' encryption format but "
201                            "options specify '%s'", encryptfmt);
202                 ret = -EINVAL;
203                 goto fail;
204             }
205             qdict_del(encryptopts, "format");
206             crypto_opts = block_crypto_open_opts_init(
207                 Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp);
208             if (!crypto_opts) {
209                 ret = -EINVAL;
210                 goto fail;
211             }
212 
213             if (flags & BDRV_O_NO_IO) {
214                 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
215             }
216             s->crypto = qcrypto_block_open(crypto_opts, "encrypt.",
217                                            NULL, NULL, cflags, errp);
218             if (!s->crypto) {
219                 ret = -EINVAL;
220                 goto fail;
221             }
222         } else {
223             error_setg(errp, "invalid encryption method in qcow header");
224             ret = -EINVAL;
225             goto fail;
226         }
227         bs->encrypted = true;
228     } else {
229         if (encryptfmt) {
230             error_setg(errp, "No encryption in image header, but options "
231                        "specified format '%s'", encryptfmt);
232             ret = -EINVAL;
233             goto fail;
234         }
235     }
236     s->cluster_bits = header.cluster_bits;
237     s->cluster_size = 1 << s->cluster_bits;
238     s->cluster_sectors = 1 << (s->cluster_bits - 9);
239     s->l2_bits = header.l2_bits;
240     s->l2_size = 1 << s->l2_bits;
241     bs->total_sectors = header.size / 512;
242     s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
243 
244     /* read the level 1 table */
245     shift = s->cluster_bits + s->l2_bits;
246     if (header.size > UINT64_MAX - (1LL << shift)) {
247         error_setg(errp, "Image too large");
248         ret = -EINVAL;
249         goto fail;
250     } else {
251         uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift;
252         if (l1_size > INT_MAX / sizeof(uint64_t)) {
253             error_setg(errp, "Image too large");
254             ret = -EINVAL;
255             goto fail;
256         }
257         s->l1_size = l1_size;
258     }
259 
260     s->l1_table_offset = header.l1_table_offset;
261     s->l1_table = g_try_new(uint64_t, s->l1_size);
262     if (s->l1_table == NULL) {
263         error_setg(errp, "Could not allocate memory for L1 table");
264         ret = -ENOMEM;
265         goto fail;
266     }
267 
268     ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
269                s->l1_size * sizeof(uint64_t));
270     if (ret < 0) {
271         goto fail;
272     }
273 
274     for(i = 0;i < s->l1_size; i++) {
275         be64_to_cpus(&s->l1_table[i]);
276     }
277 
278     /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */
279     s->l2_cache =
280         qemu_try_blockalign(bs->file->bs,
281                             s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
282     if (s->l2_cache == NULL) {
283         error_setg(errp, "Could not allocate L2 table cache");
284         ret = -ENOMEM;
285         goto fail;
286     }
287     s->cluster_cache = g_malloc(s->cluster_size);
288     s->cluster_data = g_malloc(s->cluster_size);
289     s->cluster_cache_offset = -1;
290 
291     /* read the backing file name */
292     if (header.backing_file_offset != 0) {
293         len = header.backing_file_size;
294         if (len > 1023 || len >= sizeof(bs->backing_file)) {
295             error_setg(errp, "Backing file name too long");
296             ret = -EINVAL;
297             goto fail;
298         }
299         ret = bdrv_pread(bs->file, header.backing_file_offset,
300                    bs->backing_file, len);
301         if (ret < 0) {
302             goto fail;
303         }
304         bs->backing_file[len] = '\0';
305     }
306 
307     /* Disable migration when qcow images are used */
308     error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
309                "does not support live migration",
310                bdrv_get_device_or_node_name(bs));
311     ret = migrate_add_blocker(s->migration_blocker, &local_err);
312     if (local_err) {
313         error_propagate(errp, local_err);
314         error_free(s->migration_blocker);
315         goto fail;
316     }
317 
318     qobject_unref(encryptopts);
319     qapi_free_QCryptoBlockOpenOptions(crypto_opts);
320     qemu_co_mutex_init(&s->lock);
321     return 0;
322 
323  fail:
324     g_free(s->l1_table);
325     qemu_vfree(s->l2_cache);
326     g_free(s->cluster_cache);
327     g_free(s->cluster_data);
328     qcrypto_block_free(s->crypto);
329     qobject_unref(encryptopts);
330     qapi_free_QCryptoBlockOpenOptions(crypto_opts);
331     return ret;
332 }
333 
334 
335 /* We have nothing to do for QCOW reopen, stubs just return
336  * success */
337 static int qcow_reopen_prepare(BDRVReopenState *state,
338                                BlockReopenQueue *queue, Error **errp)
339 {
340     return 0;
341 }
342 
343 
344 /* 'allocate' is:
345  *
346  * 0 to not allocate.
347  *
348  * 1 to allocate a normal cluster (for sector indexes 'n_start' to
349  * 'n_end')
350  *
351  * 2 to allocate a compressed cluster of size
352  * 'compressed_size'. 'compressed_size' must be > 0 and <
353  * cluster_size
354  *
355  * return 0 if not allocated, 1 if *result is assigned, and negative
356  * errno on failure.
357  */
358 static int get_cluster_offset(BlockDriverState *bs,
359                               uint64_t offset, int allocate,
360                               int compressed_size,
361                               int n_start, int n_end, uint64_t *result)
362 {
363     BDRVQcowState *s = bs->opaque;
364     int min_index, i, j, l1_index, l2_index, ret;
365     int64_t l2_offset;
366     uint64_t *l2_table, cluster_offset, tmp;
367     uint32_t min_count;
368     int new_l2_table;
369 
370     *result = 0;
371     l1_index = offset >> (s->l2_bits + s->cluster_bits);
372     l2_offset = s->l1_table[l1_index];
373     new_l2_table = 0;
374     if (!l2_offset) {
375         if (!allocate)
376             return 0;
377         /* allocate a new l2 entry */
378         l2_offset = bdrv_getlength(bs->file->bs);
379         if (l2_offset < 0) {
380             return l2_offset;
381         }
382         /* round to cluster size */
383         l2_offset = QEMU_ALIGN_UP(l2_offset, s->cluster_size);
384         /* update the L1 entry */
385         s->l1_table[l1_index] = l2_offset;
386         tmp = cpu_to_be64(l2_offset);
387         BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
388         ret = bdrv_pwrite_sync(bs->file,
389                                s->l1_table_offset + l1_index * sizeof(tmp),
390                                &tmp, sizeof(tmp));
391         if (ret < 0) {
392             return ret;
393         }
394         new_l2_table = 1;
395     }
396     for(i = 0; i < L2_CACHE_SIZE; i++) {
397         if (l2_offset == s->l2_cache_offsets[i]) {
398             /* increment the hit count */
399             if (++s->l2_cache_counts[i] == 0xffffffff) {
400                 for(j = 0; j < L2_CACHE_SIZE; j++) {
401                     s->l2_cache_counts[j] >>= 1;
402                 }
403             }
404             l2_table = s->l2_cache + (i << s->l2_bits);
405             goto found;
406         }
407     }
408     /* not found: load a new entry in the least used one */
409     min_index = 0;
410     min_count = 0xffffffff;
411     for(i = 0; i < L2_CACHE_SIZE; i++) {
412         if (s->l2_cache_counts[i] < min_count) {
413             min_count = s->l2_cache_counts[i];
414             min_index = i;
415         }
416     }
417     l2_table = s->l2_cache + (min_index << s->l2_bits);
418     BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
419     if (new_l2_table) {
420         memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
421         ret = bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
422                                s->l2_size * sizeof(uint64_t));
423         if (ret < 0) {
424             return ret;
425         }
426     } else {
427         ret = bdrv_pread(bs->file, l2_offset, l2_table,
428                          s->l2_size * sizeof(uint64_t));
429         if (ret < 0) {
430             return ret;
431         }
432     }
433     s->l2_cache_offsets[min_index] = l2_offset;
434     s->l2_cache_counts[min_index] = 1;
435  found:
436     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
437     cluster_offset = be64_to_cpu(l2_table[l2_index]);
438     if (!cluster_offset ||
439         ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
440         if (!allocate)
441             return 0;
442         BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
443         /* allocate a new cluster */
444         if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
445             (n_end - n_start) < s->cluster_sectors) {
446             /* if the cluster is already compressed, we must
447                decompress it in the case it is not completely
448                overwritten */
449             if (decompress_cluster(bs, cluster_offset) < 0) {
450                 return -EIO;
451             }
452             cluster_offset = bdrv_getlength(bs->file->bs);
453             if ((int64_t) cluster_offset < 0) {
454                 return cluster_offset;
455             }
456             cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
457             /* write the cluster content */
458             BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
459             ret = bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache,
460                               s->cluster_size);
461             if (ret < 0) {
462                 return ret;
463             }
464         } else {
465             cluster_offset = bdrv_getlength(bs->file->bs);
466             if ((int64_t) cluster_offset < 0) {
467                 return cluster_offset;
468             }
469             if (allocate == 1) {
470                 /* round to cluster size */
471                 cluster_offset = QEMU_ALIGN_UP(cluster_offset, s->cluster_size);
472                 if (cluster_offset + s->cluster_size > INT64_MAX) {
473                     return -E2BIG;
474                 }
475                 ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size,
476                                     PREALLOC_MODE_OFF, NULL);
477                 if (ret < 0) {
478                     return ret;
479                 }
480                 /* if encrypted, we must initialize the cluster
481                    content which won't be written */
482                 if (bs->encrypted &&
483                     (n_end - n_start) < s->cluster_sectors) {
484                     uint64_t start_sect;
485                     assert(s->crypto);
486                     start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
487                     for(i = 0; i < s->cluster_sectors; i++) {
488                         if (i < n_start || i >= n_end) {
489                             memset(s->cluster_data, 0x00, 512);
490                             if (qcrypto_block_encrypt(s->crypto,
491                                                       (start_sect + i) *
492                                                       BDRV_SECTOR_SIZE,
493                                                       s->cluster_data,
494                                                       BDRV_SECTOR_SIZE,
495                                                       NULL) < 0) {
496                                 return -EIO;
497                             }
498                             BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
499                             ret = bdrv_pwrite(bs->file,
500                                               cluster_offset + i * 512,
501                                               s->cluster_data, 512);
502                             if (ret < 0) {
503                                 return ret;
504                             }
505                         }
506                     }
507                 }
508             } else if (allocate == 2) {
509                 cluster_offset |= QCOW_OFLAG_COMPRESSED |
510                     (uint64_t)compressed_size << (63 - s->cluster_bits);
511             }
512         }
513         /* update L2 table */
514         tmp = cpu_to_be64(cluster_offset);
515         l2_table[l2_index] = tmp;
516         if (allocate == 2) {
517             BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
518         } else {
519             BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
520         }
521         ret = bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
522                                &tmp, sizeof(tmp));
523         if (ret < 0) {
524             return ret;
525         }
526     }
527     *result = cluster_offset;
528     return 1;
529 }
530 
531 static int coroutine_fn qcow_co_block_status(BlockDriverState *bs,
532                                              bool want_zero,
533                                              int64_t offset, int64_t bytes,
534                                              int64_t *pnum, int64_t *map,
535                                              BlockDriverState **file)
536 {
537     BDRVQcowState *s = bs->opaque;
538     int index_in_cluster, ret;
539     int64_t n;
540     uint64_t cluster_offset;
541 
542     qemu_co_mutex_lock(&s->lock);
543     ret = get_cluster_offset(bs, offset, 0, 0, 0, 0, &cluster_offset);
544     qemu_co_mutex_unlock(&s->lock);
545     if (ret < 0) {
546         return ret;
547     }
548     index_in_cluster = offset & (s->cluster_size - 1);
549     n = s->cluster_size - index_in_cluster;
550     if (n > bytes) {
551         n = bytes;
552     }
553     *pnum = n;
554     if (!cluster_offset) {
555         return 0;
556     }
557     if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypto) {
558         return BDRV_BLOCK_DATA;
559     }
560     *map = cluster_offset | index_in_cluster;
561     *file = bs->file->bs;
562     return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
563 }
564 
565 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
566                              const uint8_t *buf, int buf_size)
567 {
568     z_stream strm1, *strm = &strm1;
569     int ret, out_len;
570 
571     memset(strm, 0, sizeof(*strm));
572 
573     strm->next_in = (uint8_t *)buf;
574     strm->avail_in = buf_size;
575     strm->next_out = out_buf;
576     strm->avail_out = out_buf_size;
577 
578     ret = inflateInit2(strm, -12);
579     if (ret != Z_OK)
580         return -1;
581     ret = inflate(strm, Z_FINISH);
582     out_len = strm->next_out - out_buf;
583     if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
584         out_len != out_buf_size) {
585         inflateEnd(strm);
586         return -1;
587     }
588     inflateEnd(strm);
589     return 0;
590 }
591 
592 static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
593 {
594     BDRVQcowState *s = bs->opaque;
595     int ret, csize;
596     uint64_t coffset;
597 
598     coffset = cluster_offset & s->cluster_offset_mask;
599     if (s->cluster_cache_offset != coffset) {
600         csize = cluster_offset >> (63 - s->cluster_bits);
601         csize &= (s->cluster_size - 1);
602         BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
603         ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
604         if (ret != csize)
605             return -1;
606         if (decompress_buffer(s->cluster_cache, s->cluster_size,
607                               s->cluster_data, csize) < 0) {
608             return -1;
609         }
610         s->cluster_cache_offset = coffset;
611     }
612     return 0;
613 }
614 
615 static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
616                          int nb_sectors, QEMUIOVector *qiov)
617 {
618     BDRVQcowState *s = bs->opaque;
619     int index_in_cluster;
620     int ret = 0, n;
621     uint64_t cluster_offset;
622     struct iovec hd_iov;
623     QEMUIOVector hd_qiov;
624     uint8_t *buf;
625     void *orig_buf;
626 
627     if (qiov->niov > 1) {
628         buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
629         if (buf == NULL) {
630             return -ENOMEM;
631         }
632     } else {
633         orig_buf = NULL;
634         buf = (uint8_t *)qiov->iov->iov_base;
635     }
636 
637     qemu_co_mutex_lock(&s->lock);
638 
639     while (nb_sectors != 0) {
640         /* prepare next request */
641         ret = get_cluster_offset(bs, sector_num << 9,
642                                  0, 0, 0, 0, &cluster_offset);
643         if (ret < 0) {
644             break;
645         }
646         index_in_cluster = sector_num & (s->cluster_sectors - 1);
647         n = s->cluster_sectors - index_in_cluster;
648         if (n > nb_sectors) {
649             n = nb_sectors;
650         }
651 
652         if (!cluster_offset) {
653             if (bs->backing) {
654                 /* read from the base image */
655                 hd_iov.iov_base = (void *)buf;
656                 hd_iov.iov_len = n * 512;
657                 qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
658                 qemu_co_mutex_unlock(&s->lock);
659                 /* qcow2 emits this on bs->file instead of bs->backing */
660                 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
661                 ret = bdrv_co_readv(bs->backing, sector_num, n, &hd_qiov);
662                 qemu_co_mutex_lock(&s->lock);
663                 if (ret < 0) {
664                     break;
665                 }
666             } else {
667                 /* Note: in this case, no need to wait */
668                 memset(buf, 0, 512 * n);
669             }
670         } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
671             /* add AIO support for compressed blocks ? */
672             if (decompress_cluster(bs, cluster_offset) < 0) {
673                 ret = -EIO;
674                 break;
675             }
676             memcpy(buf,
677                    s->cluster_cache + index_in_cluster * 512, 512 * n);
678         } else {
679             if ((cluster_offset & 511) != 0) {
680                 ret = -EIO;
681                 break;
682             }
683             hd_iov.iov_base = (void *)buf;
684             hd_iov.iov_len = n * 512;
685             qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
686             qemu_co_mutex_unlock(&s->lock);
687             BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
688             ret = bdrv_co_readv(bs->file,
689                                 (cluster_offset >> 9) + index_in_cluster,
690                                 n, &hd_qiov);
691             qemu_co_mutex_lock(&s->lock);
692             if (ret < 0) {
693                 break;
694             }
695             if (bs->encrypted) {
696                 assert(s->crypto);
697                 if (qcrypto_block_decrypt(s->crypto,
698                                           sector_num * BDRV_SECTOR_SIZE, buf,
699                                           n * BDRV_SECTOR_SIZE, NULL) < 0) {
700                     ret = -EIO;
701                     break;
702                 }
703             }
704         }
705         ret = 0;
706 
707         nb_sectors -= n;
708         sector_num += n;
709         buf += n * 512;
710     }
711 
712     qemu_co_mutex_unlock(&s->lock);
713 
714     if (qiov->niov > 1) {
715         qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size);
716         qemu_vfree(orig_buf);
717     }
718 
719     return ret;
720 }
721 
722 static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
723                           int nb_sectors, QEMUIOVector *qiov)
724 {
725     BDRVQcowState *s = bs->opaque;
726     int index_in_cluster;
727     uint64_t cluster_offset;
728     int ret = 0, n;
729     struct iovec hd_iov;
730     QEMUIOVector hd_qiov;
731     uint8_t *buf;
732     void *orig_buf;
733 
734     s->cluster_cache_offset = -1; /* disable compressed cache */
735 
736     /* We must always copy the iov when encrypting, so we
737      * don't modify the original data buffer during encryption */
738     if (bs->encrypted || qiov->niov > 1) {
739         buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
740         if (buf == NULL) {
741             return -ENOMEM;
742         }
743         qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
744     } else {
745         orig_buf = NULL;
746         buf = (uint8_t *)qiov->iov->iov_base;
747     }
748 
749     qemu_co_mutex_lock(&s->lock);
750 
751     while (nb_sectors != 0) {
752 
753         index_in_cluster = sector_num & (s->cluster_sectors - 1);
754         n = s->cluster_sectors - index_in_cluster;
755         if (n > nb_sectors) {
756             n = nb_sectors;
757         }
758         ret = get_cluster_offset(bs, sector_num << 9, 1, 0,
759                                  index_in_cluster,
760                                  index_in_cluster + n, &cluster_offset);
761         if (ret < 0) {
762             break;
763         }
764         if (!cluster_offset || (cluster_offset & 511) != 0) {
765             ret = -EIO;
766             break;
767         }
768         if (bs->encrypted) {
769             assert(s->crypto);
770             if (qcrypto_block_encrypt(s->crypto, sector_num * BDRV_SECTOR_SIZE,
771                                       buf, n * BDRV_SECTOR_SIZE, NULL) < 0) {
772                 ret = -EIO;
773                 break;
774             }
775         }
776 
777         hd_iov.iov_base = (void *)buf;
778         hd_iov.iov_len = n * 512;
779         qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
780         qemu_co_mutex_unlock(&s->lock);
781         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
782         ret = bdrv_co_writev(bs->file,
783                              (cluster_offset >> 9) + index_in_cluster,
784                              n, &hd_qiov);
785         qemu_co_mutex_lock(&s->lock);
786         if (ret < 0) {
787             break;
788         }
789         ret = 0;
790 
791         nb_sectors -= n;
792         sector_num += n;
793         buf += n * 512;
794     }
795     qemu_co_mutex_unlock(&s->lock);
796 
797     qemu_vfree(orig_buf);
798 
799     return ret;
800 }
801 
802 static void qcow_close(BlockDriverState *bs)
803 {
804     BDRVQcowState *s = bs->opaque;
805 
806     qcrypto_block_free(s->crypto);
807     s->crypto = NULL;
808     g_free(s->l1_table);
809     qemu_vfree(s->l2_cache);
810     g_free(s->cluster_cache);
811     g_free(s->cluster_data);
812 
813     migrate_del_blocker(s->migration_blocker);
814     error_free(s->migration_blocker);
815 }
816 
817 static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
818                                        Error **errp)
819 {
820     BlockdevCreateOptionsQcow *qcow_opts;
821     int header_size, backing_filename_len, l1_size, shift, i;
822     QCowHeader header;
823     uint8_t *tmp;
824     int64_t total_size = 0;
825     int ret;
826     BlockDriverState *bs;
827     BlockBackend *qcow_blk;
828     QCryptoBlock *crypto = NULL;
829 
830     assert(opts->driver == BLOCKDEV_DRIVER_QCOW);
831     qcow_opts = &opts->u.qcow;
832 
833     /* Sanity checks */
834     total_size = qcow_opts->size;
835     if (total_size == 0) {
836         error_setg(errp, "Image size is too small, cannot be zero length");
837         return -EINVAL;
838     }
839 
840     if (qcow_opts->has_encrypt &&
841         qcow_opts->encrypt->format != Q_CRYPTO_BLOCK_FORMAT_QCOW)
842     {
843         error_setg(errp, "Unsupported encryption format");
844         return -EINVAL;
845     }
846 
847     /* Create BlockBackend to write to the image */
848     bs = bdrv_open_blockdev_ref(qcow_opts->file, errp);
849     if (bs == NULL) {
850         return -EIO;
851     }
852 
853     qcow_blk = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
854     ret = blk_insert_bs(qcow_blk, bs, errp);
855     if (ret < 0) {
856         goto exit;
857     }
858     blk_set_allow_write_beyond_eof(qcow_blk, true);
859 
860     /* Create image format */
861     ret = blk_truncate(qcow_blk, 0, PREALLOC_MODE_OFF, errp);
862     if (ret < 0) {
863         goto exit;
864     }
865 
866     memset(&header, 0, sizeof(header));
867     header.magic = cpu_to_be32(QCOW_MAGIC);
868     header.version = cpu_to_be32(QCOW_VERSION);
869     header.size = cpu_to_be64(total_size);
870     header_size = sizeof(header);
871     backing_filename_len = 0;
872     if (qcow_opts->has_backing_file) {
873         if (strcmp(qcow_opts->backing_file, "fat:")) {
874             header.backing_file_offset = cpu_to_be64(header_size);
875             backing_filename_len = strlen(qcow_opts->backing_file);
876             header.backing_file_size = cpu_to_be32(backing_filename_len);
877             header_size += backing_filename_len;
878         } else {
879             /* special backing file for vvfat */
880             qcow_opts->has_backing_file = false;
881         }
882         header.cluster_bits = 9; /* 512 byte cluster to avoid copying
883                                     unmodified sectors */
884         header.l2_bits = 12; /* 32 KB L2 tables */
885     } else {
886         header.cluster_bits = 12; /* 4 KB clusters */
887         header.l2_bits = 9; /* 4 KB L2 tables */
888     }
889     header_size = (header_size + 7) & ~7;
890     shift = header.cluster_bits + header.l2_bits;
891     l1_size = (total_size + (1LL << shift) - 1) >> shift;
892 
893     header.l1_table_offset = cpu_to_be64(header_size);
894 
895     if (qcow_opts->has_encrypt) {
896         header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
897 
898         crypto = qcrypto_block_create(qcow_opts->encrypt, "encrypt.",
899                                       NULL, NULL, NULL, errp);
900         if (!crypto) {
901             ret = -EINVAL;
902             goto exit;
903         }
904     } else {
905         header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
906     }
907 
908     /* write all the data */
909     ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header), 0);
910     if (ret != sizeof(header)) {
911         goto exit;
912     }
913 
914     if (qcow_opts->has_backing_file) {
915         ret = blk_pwrite(qcow_blk, sizeof(header),
916                          qcow_opts->backing_file, backing_filename_len, 0);
917         if (ret != backing_filename_len) {
918             goto exit;
919         }
920     }
921 
922     tmp = g_malloc0(BDRV_SECTOR_SIZE);
923     for (i = 0; i < DIV_ROUND_UP(sizeof(uint64_t) * l1_size, BDRV_SECTOR_SIZE);
924          i++) {
925         ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE * i,
926                          tmp, BDRV_SECTOR_SIZE, 0);
927         if (ret != BDRV_SECTOR_SIZE) {
928             g_free(tmp);
929             goto exit;
930         }
931     }
932 
933     g_free(tmp);
934     ret = 0;
935 exit:
936     blk_unref(qcow_blk);
937     qcrypto_block_free(crypto);
938     return ret;
939 }
940 
941 static int coroutine_fn qcow_co_create_opts(const char *filename,
942                                             QemuOpts *opts, Error **errp)
943 {
944     BlockdevCreateOptions *create_options = NULL;
945     BlockDriverState *bs = NULL;
946     QDict *qdict = NULL;
947     QObject *qobj;
948     Visitor *v;
949     const char *val;
950     Error *local_err = NULL;
951     int ret;
952 
953     static const QDictRenames opt_renames[] = {
954         { BLOCK_OPT_BACKING_FILE,       "backing-file" },
955         { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
956         { NULL, NULL },
957     };
958 
959     /* Parse options and convert legacy syntax */
960     qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qcow_create_opts, true);
961 
962     val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
963     if (val && !strcmp(val, "on")) {
964         qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
965     } else if (val && !strcmp(val, "off")) {
966         qdict_del(qdict, BLOCK_OPT_ENCRYPT);
967     }
968 
969     val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
970     if (val && !strcmp(val, "aes")) {
971         qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
972     }
973 
974     if (!qdict_rename_keys(qdict, opt_renames, errp)) {
975         ret = -EINVAL;
976         goto fail;
977     }
978 
979     /* Create and open the file (protocol layer) */
980     ret = bdrv_create_file(filename, opts, &local_err);
981     if (ret < 0) {
982         error_propagate(errp, local_err);
983         goto fail;
984     }
985 
986     bs = bdrv_open(filename, NULL, NULL,
987                    BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
988     if (bs == NULL) {
989         ret = -EIO;
990         goto fail;
991     }
992 
993     /* Now get the QAPI type BlockdevCreateOptions */
994     qdict_put_str(qdict, "driver", "qcow");
995     qdict_put_str(qdict, "file", bs->node_name);
996 
997     qobj = qdict_crumple(qdict, errp);
998     qobject_unref(qdict);
999     qdict = qobject_to(QDict, qobj);
1000     if (qdict == NULL) {
1001         ret = -EINVAL;
1002         goto fail;
1003     }
1004 
1005     v = qobject_input_visitor_new_keyval(QOBJECT(qdict));
1006     visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
1007     visit_free(v);
1008 
1009     if (local_err) {
1010         error_propagate(errp, local_err);
1011         ret = -EINVAL;
1012         goto fail;
1013     }
1014 
1015     /* Silently round up size */
1016     assert(create_options->driver == BLOCKDEV_DRIVER_QCOW);
1017     create_options->u.qcow.size =
1018         ROUND_UP(create_options->u.qcow.size, BDRV_SECTOR_SIZE);
1019 
1020     /* Create the qcow image (format layer) */
1021     ret = qcow_co_create(create_options, errp);
1022     if (ret < 0) {
1023         goto fail;
1024     }
1025 
1026     ret = 0;
1027 fail:
1028     qobject_unref(qdict);
1029     bdrv_unref(bs);
1030     qapi_free_BlockdevCreateOptions(create_options);
1031     return ret;
1032 }
1033 
1034 static int qcow_make_empty(BlockDriverState *bs)
1035 {
1036     BDRVQcowState *s = bs->opaque;
1037     uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1038     int ret;
1039 
1040     memset(s->l1_table, 0, l1_length);
1041     if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
1042             l1_length) < 0)
1043         return -1;
1044     ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length,
1045                         PREALLOC_MODE_OFF, NULL);
1046     if (ret < 0)
1047         return ret;
1048 
1049     memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
1050     memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
1051     memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
1052 
1053     return 0;
1054 }
1055 
1056 /* XXX: put compressed sectors first, then all the cluster aligned
1057    tables to avoid losing bytes in alignment */
1058 static coroutine_fn int
1059 qcow_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1060                            uint64_t bytes, QEMUIOVector *qiov)
1061 {
1062     BDRVQcowState *s = bs->opaque;
1063     QEMUIOVector hd_qiov;
1064     struct iovec iov;
1065     z_stream strm;
1066     int ret, out_len;
1067     uint8_t *buf, *out_buf;
1068     uint64_t cluster_offset;
1069 
1070     buf = qemu_blockalign(bs, s->cluster_size);
1071     if (bytes != s->cluster_size) {
1072         if (bytes > s->cluster_size ||
1073             offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
1074         {
1075             qemu_vfree(buf);
1076             return -EINVAL;
1077         }
1078         /* Zero-pad last write if image size is not cluster aligned */
1079         memset(buf + bytes, 0, s->cluster_size - bytes);
1080     }
1081     qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
1082 
1083     out_buf = g_malloc(s->cluster_size);
1084 
1085     /* best compression, small window, no zlib header */
1086     memset(&strm, 0, sizeof(strm));
1087     ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1088                        Z_DEFLATED, -12,
1089                        9, Z_DEFAULT_STRATEGY);
1090     if (ret != 0) {
1091         ret = -EINVAL;
1092         goto fail;
1093     }
1094 
1095     strm.avail_in = s->cluster_size;
1096     strm.next_in = (uint8_t *)buf;
1097     strm.avail_out = s->cluster_size;
1098     strm.next_out = out_buf;
1099 
1100     ret = deflate(&strm, Z_FINISH);
1101     if (ret != Z_STREAM_END && ret != Z_OK) {
1102         deflateEnd(&strm);
1103         ret = -EINVAL;
1104         goto fail;
1105     }
1106     out_len = strm.next_out - out_buf;
1107 
1108     deflateEnd(&strm);
1109 
1110     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1111         /* could not compress: write normal cluster */
1112         ret = qcow_co_writev(bs, offset >> BDRV_SECTOR_BITS,
1113                              bytes >> BDRV_SECTOR_BITS, qiov);
1114         if (ret < 0) {
1115             goto fail;
1116         }
1117         goto success;
1118     }
1119     qemu_co_mutex_lock(&s->lock);
1120     ret = get_cluster_offset(bs, offset, 2, out_len, 0, 0, &cluster_offset);
1121     qemu_co_mutex_unlock(&s->lock);
1122     if (ret < 0) {
1123         goto fail;
1124     }
1125     if (cluster_offset == 0) {
1126         ret = -EIO;
1127         goto fail;
1128     }
1129     cluster_offset &= s->cluster_offset_mask;
1130 
1131     iov = (struct iovec) {
1132         .iov_base   = out_buf,
1133         .iov_len    = out_len,
1134     };
1135     qemu_iovec_init_external(&hd_qiov, &iov, 1);
1136     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
1137     ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
1138     if (ret < 0) {
1139         goto fail;
1140     }
1141 success:
1142     ret = 0;
1143 fail:
1144     qemu_vfree(buf);
1145     g_free(out_buf);
1146     return ret;
1147 }
1148 
1149 static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1150 {
1151     BDRVQcowState *s = bs->opaque;
1152     bdi->cluster_size = s->cluster_size;
1153     return 0;
1154 }
1155 
1156 static QemuOptsList qcow_create_opts = {
1157     .name = "qcow-create-opts",
1158     .head = QTAILQ_HEAD_INITIALIZER(qcow_create_opts.head),
1159     .desc = {
1160         {
1161             .name = BLOCK_OPT_SIZE,
1162             .type = QEMU_OPT_SIZE,
1163             .help = "Virtual disk size"
1164         },
1165         {
1166             .name = BLOCK_OPT_BACKING_FILE,
1167             .type = QEMU_OPT_STRING,
1168             .help = "File name of a base image"
1169         },
1170         {
1171             .name = BLOCK_OPT_ENCRYPT,
1172             .type = QEMU_OPT_BOOL,
1173             .help = "Encrypt the image with format 'aes'. (Deprecated "
1174                     "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
1175         },
1176         {
1177             .name = BLOCK_OPT_ENCRYPT_FORMAT,
1178             .type = QEMU_OPT_STRING,
1179             .help = "Encrypt the image, format choices: 'aes'",
1180         },
1181         BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET("encrypt."),
1182         { /* end of list */ }
1183     }
1184 };
1185 
1186 static BlockDriver bdrv_qcow = {
1187     .format_name	= "qcow",
1188     .instance_size	= sizeof(BDRVQcowState),
1189     .bdrv_probe		= qcow_probe,
1190     .bdrv_open		= qcow_open,
1191     .bdrv_close		= qcow_close,
1192     .bdrv_child_perm        = bdrv_format_default_perms,
1193     .bdrv_reopen_prepare    = qcow_reopen_prepare,
1194     .bdrv_co_create         = qcow_co_create,
1195     .bdrv_co_create_opts    = qcow_co_create_opts,
1196     .bdrv_has_zero_init     = bdrv_has_zero_init_1,
1197     .supports_backing       = true,
1198 
1199     .bdrv_co_readv          = qcow_co_readv,
1200     .bdrv_co_writev         = qcow_co_writev,
1201     .bdrv_co_block_status   = qcow_co_block_status,
1202 
1203     .bdrv_make_empty        = qcow_make_empty,
1204     .bdrv_co_pwritev_compressed = qcow_co_pwritev_compressed,
1205     .bdrv_get_info          = qcow_get_info,
1206 
1207     .create_opts            = &qcow_create_opts,
1208 };
1209 
/* Pass the qcow driver description (bdrv_qcow) to bdrv_register(). */
static void bdrv_qcow_init(void)
{
    bdrv_register(&bdrv_qcow);
}

/*
 * NOTE(review): block_init() presumably schedules bdrv_qcow_init() to run
 * during block module initialisation - confirm against qemu/module.h.
 */
block_init(bdrv_qcow_init);
1216