xref: /openbmc/qemu/block/qcow2.c (revision 761d524d)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "qemu-common.h"
25 #include "block/block_int.h"
26 #include "qemu/module.h"
27 #include <zlib.h>
28 #include "qemu/aes.h"
29 #include "block/qcow2.h"
30 #include "qemu/error-report.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qapi/qmp/qbool.h"
33 #include "trace.h"
34 
35 /*
36   Differences with QCOW:
37 
38   - Support for multiple incremental snapshots.
39   - Memory management by reference counts.
40   - Clusters which have a reference count of one have the bit
41     QCOW_OFLAG_COPIED to optimize write performance.
42   - Size of compressed clusters is stored in sectors to reduce bit usage
43     in the cluster offsets.
44   - Support for storing additional data (such as the VM state) in the
45     snapshots.
46   - If a backing store is used, the cluster size is not constrained
47     (could be backported to QCOW).
  - L2 tables always have a size of one cluster.
49 */
50 
51 
/*
 * Fixed-size header that precedes every qcow2 header extension.  Both
 * fields are stored big-endian in the image and are byte-swapped to host
 * order right after being read.
 */
typedef struct {
    uint32_t magic;     /* extension type, see QCOW2_EXT_MAGIC_* below */
    uint32_t len;       /* number of payload bytes following this header */
} QCowExtension;

/* Header extension types understood by this driver */
#define QCOW2_EXT_MAGIC_END             0
#define QCOW2_EXT_MAGIC_BACKING_FORMAT  0xE2792ACA
#define QCOW2_EXT_MAGIC_FEATURE_TABLE   0x6803f857
60 
61 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
62 {
63     const QCowHeader *cow_header = (const void *)buf;
64 
65     if (buf_size >= sizeof(QCowHeader) &&
66         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
67         be32_to_cpu(cow_header->version) >= 2)
68         return 100;
69     else
70         return 0;
71 }
72 
73 
74 /*
75  * read qcow2 extension and fill bs
76  * start reading from start_offset
77  * finish reading upon magic of value 0 or when end_offset reached
78  * unknown magic is skipped (future extension this version knows nothing about)
79  * return 0 upon success, non-0 otherwise
80  */
81 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
82                                  uint64_t end_offset, void **p_feature_table)
83 {
84     BDRVQcowState *s = bs->opaque;
85     QCowExtension ext;
86     uint64_t offset;
87     int ret;
88 
89 #ifdef DEBUG_EXT
90     printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
91 #endif
92     offset = start_offset;
93     while (offset < end_offset) {
94 
95 #ifdef DEBUG_EXT
96         /* Sanity check */
97         if (offset > s->cluster_size)
98             printf("qcow2_read_extension: suspicious offset %lu\n", offset);
99 
100         printf("attempting to read extended header in offset %lu\n", offset);
101 #endif
102 
103         if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) {
104             fprintf(stderr, "qcow2_read_extension: ERROR: "
105                     "pread fail from offset %" PRIu64 "\n",
106                     offset);
107             return 1;
108         }
109         be32_to_cpus(&ext.magic);
110         be32_to_cpus(&ext.len);
111         offset += sizeof(ext);
112 #ifdef DEBUG_EXT
113         printf("ext.magic = 0x%x\n", ext.magic);
114 #endif
115         if (ext.len > end_offset - offset) {
116             error_report("Header extension too large");
117             return -EINVAL;
118         }
119 
120         switch (ext.magic) {
121         case QCOW2_EXT_MAGIC_END:
122             return 0;
123 
124         case QCOW2_EXT_MAGIC_BACKING_FORMAT:
125             if (ext.len >= sizeof(bs->backing_format)) {
126                 fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
127                         " (>=%zu)\n",
128                         ext.len, sizeof(bs->backing_format));
129                 return 2;
130             }
131             if (bdrv_pread(bs->file, offset , bs->backing_format,
132                            ext.len) != ext.len)
133                 return 3;
134             bs->backing_format[ext.len] = '\0';
135 #ifdef DEBUG_EXT
136             printf("Qcow2: Got format extension %s\n", bs->backing_format);
137 #endif
138             break;
139 
140         case QCOW2_EXT_MAGIC_FEATURE_TABLE:
141             if (p_feature_table != NULL) {
142                 void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
143                 ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
144                 if (ret < 0) {
145                     return ret;
146                 }
147 
148                 *p_feature_table = feature_table;
149             }
150             break;
151 
152         default:
153             /* unknown magic - save it in case we need to rewrite the header */
154             {
155                 Qcow2UnknownHeaderExtension *uext;
156 
157                 uext = g_malloc0(sizeof(*uext)  + ext.len);
158                 uext->magic = ext.magic;
159                 uext->len = ext.len;
160                 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
161 
162                 ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
163                 if (ret < 0) {
164                     return ret;
165                 }
166             }
167             break;
168         }
169 
170         offset += ((ext.len + 7) & ~7);
171     }
172 
173     return 0;
174 }
175 
176 static void cleanup_unknown_header_ext(BlockDriverState *bs)
177 {
178     BDRVQcowState *s = bs->opaque;
179     Qcow2UnknownHeaderExtension *uext, *next;
180 
181     QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
182         QLIST_REMOVE(uext, next);
183         g_free(uext);
184     }
185 }
186 
187 static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs,
188     const char *fmt, ...)
189 {
190     char msg[64];
191     va_list ap;
192 
193     va_start(ap, fmt);
194     vsnprintf(msg, sizeof(msg), fmt, ap);
195     va_end(ap);
196 
197     qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
198         bs->device_name, "qcow2", msg);
199 }
200 
201 static void report_unsupported_feature(BlockDriverState *bs,
202     Qcow2Feature *table, uint64_t mask)
203 {
204     while (table && table->name[0] != '\0') {
205         if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
206             if (mask & (1 << table->bit)) {
207                 report_unsupported(bs, "%.46s",table->name);
208                 mask &= ~(1 << table->bit);
209             }
210         }
211         table++;
212     }
213 
214     if (mask) {
215         report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask);
216     }
217 }
218 
219 /*
220  * Sets the dirty bit and flushes afterwards if necessary.
221  *
222  * The incompatible_features bit is only set if the image file header was
223  * updated successfully.  Therefore it is not required to check the return
224  * value of this function.
225  */
226 int qcow2_mark_dirty(BlockDriverState *bs)
227 {
228     BDRVQcowState *s = bs->opaque;
229     uint64_t val;
230     int ret;
231 
232     assert(s->qcow_version >= 3);
233 
234     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
235         return 0; /* already dirty */
236     }
237 
238     val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
239     ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
240                       &val, sizeof(val));
241     if (ret < 0) {
242         return ret;
243     }
244     ret = bdrv_flush(bs->file);
245     if (ret < 0) {
246         return ret;
247     }
248 
249     /* Only treat image as dirty if the header was updated successfully */
250     s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
251     return 0;
252 }
253 
254 /*
255  * Clears the dirty bit and flushes before if necessary.  Only call this
256  * function when there are no pending requests, it does not guard against
257  * concurrent requests dirtying the image.
258  */
259 static int qcow2_mark_clean(BlockDriverState *bs)
260 {
261     BDRVQcowState *s = bs->opaque;
262 
263     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
264         int ret = bdrv_flush(bs);
265         if (ret < 0) {
266             return ret;
267         }
268 
269         s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
270         return qcow2_update_header(bs);
271     }
272     return 0;
273 }
274 
275 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
276                        BdrvCheckMode fix)
277 {
278     int ret = qcow2_check_refcounts(bs, result, fix);
279     if (ret < 0) {
280         return ret;
281     }
282 
283     if (fix && result->check_errors == 0 && result->corruptions == 0) {
284         return qcow2_mark_clean(bs);
285     }
286     return ret;
287 }
288 
289 static QemuOptsList qcow2_runtime_opts = {
290     .name = "qcow2",
291     .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
292     .desc = {
293         {
294             .name = "lazy_refcounts",
295             .type = QEMU_OPT_BOOL,
296             .help = "Postpone refcount updates",
297         },
298         {
299             .name = QCOW2_OPT_DISCARD_REQUEST,
300             .type = QEMU_OPT_BOOL,
301             .help = "Pass guest discard requests to the layer below",
302         },
303         {
304             .name = QCOW2_OPT_DISCARD_SNAPSHOT,
305             .type = QEMU_OPT_BOOL,
306             .help = "Generate discard requests when snapshot related space "
307                     "is freed",
308         },
309         {
310             .name = QCOW2_OPT_DISCARD_OTHER,
311             .type = QEMU_OPT_BOOL,
312             .help = "Generate discard requests when other clusters are freed",
313         },
314         { /* end of list */ }
315     },
316 };
317 
318 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
319 {
320     BDRVQcowState *s = bs->opaque;
321     int len, i, ret = 0;
322     QCowHeader header;
323     QemuOpts *opts;
324     Error *local_err = NULL;
325     uint64_t ext_end;
326     uint64_t l1_vm_state_index;
327 
328     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
329     if (ret < 0) {
330         goto fail;
331     }
332     be32_to_cpus(&header.magic);
333     be32_to_cpus(&header.version);
334     be64_to_cpus(&header.backing_file_offset);
335     be32_to_cpus(&header.backing_file_size);
336     be64_to_cpus(&header.size);
337     be32_to_cpus(&header.cluster_bits);
338     be32_to_cpus(&header.crypt_method);
339     be64_to_cpus(&header.l1_table_offset);
340     be32_to_cpus(&header.l1_size);
341     be64_to_cpus(&header.refcount_table_offset);
342     be32_to_cpus(&header.refcount_table_clusters);
343     be64_to_cpus(&header.snapshots_offset);
344     be32_to_cpus(&header.nb_snapshots);
345 
346     if (header.magic != QCOW_MAGIC) {
347         ret = -EMEDIUMTYPE;
348         goto fail;
349     }
350     if (header.version < 2 || header.version > 3) {
351         report_unsupported(bs, "QCOW version %d", header.version);
352         ret = -ENOTSUP;
353         goto fail;
354     }
355 
356     s->qcow_version = header.version;
357 
358     /* Initialise version 3 header fields */
359     if (header.version == 2) {
360         header.incompatible_features    = 0;
361         header.compatible_features      = 0;
362         header.autoclear_features       = 0;
363         header.refcount_order           = 4;
364         header.header_length            = 72;
365     } else {
366         be64_to_cpus(&header.incompatible_features);
367         be64_to_cpus(&header.compatible_features);
368         be64_to_cpus(&header.autoclear_features);
369         be32_to_cpus(&header.refcount_order);
370         be32_to_cpus(&header.header_length);
371     }
372 
373     if (header.header_length > sizeof(header)) {
374         s->unknown_header_fields_size = header.header_length - sizeof(header);
375         s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
376         ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
377                          s->unknown_header_fields_size);
378         if (ret < 0) {
379             goto fail;
380         }
381     }
382 
383     if (header.backing_file_offset) {
384         ext_end = header.backing_file_offset;
385     } else {
386         ext_end = 1 << header.cluster_bits;
387     }
388 
389     /* Handle feature bits */
390     s->incompatible_features    = header.incompatible_features;
391     s->compatible_features      = header.compatible_features;
392     s->autoclear_features       = header.autoclear_features;
393 
394     if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
395         void *feature_table = NULL;
396         qcow2_read_extensions(bs, header.header_length, ext_end,
397                               &feature_table);
398         report_unsupported_feature(bs, feature_table,
399                                    s->incompatible_features &
400                                    ~QCOW2_INCOMPAT_MASK);
401         ret = -ENOTSUP;
402         goto fail;
403     }
404 
405     /* Check support for various header values */
406     if (header.refcount_order != 4) {
407         report_unsupported(bs, "%d bit reference counts",
408                            1 << header.refcount_order);
409         ret = -ENOTSUP;
410         goto fail;
411     }
412 
413     if (header.cluster_bits < MIN_CLUSTER_BITS ||
414         header.cluster_bits > MAX_CLUSTER_BITS) {
415         ret = -EINVAL;
416         goto fail;
417     }
418     if (header.crypt_method > QCOW_CRYPT_AES) {
419         ret = -EINVAL;
420         goto fail;
421     }
422     s->crypt_method_header = header.crypt_method;
423     if (s->crypt_method_header) {
424         bs->encrypted = 1;
425     }
426     s->cluster_bits = header.cluster_bits;
427     s->cluster_size = 1 << s->cluster_bits;
428     s->cluster_sectors = 1 << (s->cluster_bits - 9);
429     s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
430     s->l2_size = 1 << s->l2_bits;
431     bs->total_sectors = header.size / 512;
432     s->csize_shift = (62 - (s->cluster_bits - 8));
433     s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
434     s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
435     s->refcount_table_offset = header.refcount_table_offset;
436     s->refcount_table_size =
437         header.refcount_table_clusters << (s->cluster_bits - 3);
438 
439     s->snapshots_offset = header.snapshots_offset;
440     s->nb_snapshots = header.nb_snapshots;
441 
442     /* read the level 1 table */
443     s->l1_size = header.l1_size;
444 
445     l1_vm_state_index = size_to_l1(s, header.size);
446     if (l1_vm_state_index > INT_MAX) {
447         ret = -EFBIG;
448         goto fail;
449     }
450     s->l1_vm_state_index = l1_vm_state_index;
451 
452     /* the L1 table must contain at least enough entries to put
453        header.size bytes */
454     if (s->l1_size < s->l1_vm_state_index) {
455         ret = -EINVAL;
456         goto fail;
457     }
458     s->l1_table_offset = header.l1_table_offset;
459     if (s->l1_size > 0) {
460         s->l1_table = g_malloc0(
461             align_offset(s->l1_size * sizeof(uint64_t), 512));
462         ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
463                          s->l1_size * sizeof(uint64_t));
464         if (ret < 0) {
465             goto fail;
466         }
467         for(i = 0;i < s->l1_size; i++) {
468             be64_to_cpus(&s->l1_table[i]);
469         }
470     }
471 
472     /* alloc L2 table/refcount block cache */
473     s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE);
474     s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE);
475 
476     s->cluster_cache = g_malloc(s->cluster_size);
477     /* one more sector for decompressed data alignment */
478     s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
479                                   + 512);
480     s->cluster_cache_offset = -1;
481     s->flags = flags;
482 
483     ret = qcow2_refcount_init(bs);
484     if (ret != 0) {
485         goto fail;
486     }
487 
488     QLIST_INIT(&s->cluster_allocs);
489     QTAILQ_INIT(&s->discards);
490 
491     /* read qcow2 extensions */
492     if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) {
493         ret = -EINVAL;
494         goto fail;
495     }
496 
497     /* read the backing file name */
498     if (header.backing_file_offset != 0) {
499         len = header.backing_file_size;
500         if (len > 1023) {
501             len = 1023;
502         }
503         ret = bdrv_pread(bs->file, header.backing_file_offset,
504                          bs->backing_file, len);
505         if (ret < 0) {
506             goto fail;
507         }
508         bs->backing_file[len] = '\0';
509     }
510 
511     ret = qcow2_read_snapshots(bs);
512     if (ret < 0) {
513         goto fail;
514     }
515 
516     /* Clear unknown autoclear feature bits */
517     if (!bs->read_only && s->autoclear_features != 0) {
518         s->autoclear_features = 0;
519         ret = qcow2_update_header(bs);
520         if (ret < 0) {
521             goto fail;
522         }
523     }
524 
525     /* Initialise locks */
526     qemu_co_mutex_init(&s->lock);
527 
528     /* Repair image if dirty */
529     if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
530         (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
531         BdrvCheckResult result = {0};
532 
533         ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS);
534         if (ret < 0) {
535             goto fail;
536         }
537     }
538 
539     /* Enable lazy_refcounts according to image and command line options */
540     opts = qemu_opts_create_nofail(&qcow2_runtime_opts);
541     qemu_opts_absorb_qdict(opts, options, &local_err);
542     if (error_is_set(&local_err)) {
543         qerror_report_err(local_err);
544         error_free(local_err);
545         ret = -EINVAL;
546         goto fail;
547     }
548 
549     s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
550         (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
551 
552     s->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
553     s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
554     s->discard_passthrough[QCOW2_DISCARD_REQUEST] =
555         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
556                           flags & BDRV_O_UNMAP);
557     s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
558         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
559     s->discard_passthrough[QCOW2_DISCARD_OTHER] =
560         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
561 
562     qemu_opts_del(opts);
563 
564     if (s->use_lazy_refcounts && s->qcow_version < 3) {
565         qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require "
566             "a qcow2 image with at least qemu 1.1 compatibility level");
567         ret = -EINVAL;
568         goto fail;
569     }
570 
571 #ifdef DEBUG_ALLOC
572     {
573         BdrvCheckResult result = {0};
574         qcow2_check_refcounts(bs, &result, 0);
575     }
576 #endif
577     return ret;
578 
579  fail:
580     g_free(s->unknown_header_fields);
581     cleanup_unknown_header_ext(bs);
582     qcow2_free_snapshots(bs);
583     qcow2_refcount_close(bs);
584     g_free(s->l1_table);
585     if (s->l2_table_cache) {
586         qcow2_cache_destroy(bs, s->l2_table_cache);
587     }
588     g_free(s->cluster_cache);
589     qemu_vfree(s->cluster_data);
590     return ret;
591 }
592 
593 static int qcow2_set_key(BlockDriverState *bs, const char *key)
594 {
595     BDRVQcowState *s = bs->opaque;
596     uint8_t keybuf[16];
597     int len, i;
598 
599     memset(keybuf, 0, 16);
600     len = strlen(key);
601     if (len > 16)
602         len = 16;
603     /* XXX: we could compress the chars to 7 bits to increase
604        entropy */
605     for(i = 0;i < len;i++) {
606         keybuf[i] = key[i];
607     }
608     s->crypt_method = s->crypt_method_header;
609 
610     if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
611         return -1;
612     if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
613         return -1;
614 #if 0
615     /* test */
616     {
617         uint8_t in[16];
618         uint8_t out[16];
619         uint8_t tmp[16];
620         for(i=0;i<16;i++)
621             in[i] = i;
622         AES_encrypt(in, tmp, &s->aes_encrypt_key);
623         AES_decrypt(tmp, out, &s->aes_decrypt_key);
624         for(i = 0; i < 16; i++)
625             printf(" %02x", tmp[i]);
626         printf("\n");
627         for(i = 0; i < 16; i++)
628             printf(" %02x", out[i]);
629         printf("\n");
630     }
631 #endif
632     return 0;
633 }
634 
635 /* We have nothing to do for QCOW2 reopen, stubs just return
636  * success */
637 static int qcow2_reopen_prepare(BDRVReopenState *state,
638                                 BlockReopenQueue *queue, Error **errp)
639 {
640     return 0;
641 }
642 
643 static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
644         int64_t sector_num, int nb_sectors, int *pnum)
645 {
646     BDRVQcowState *s = bs->opaque;
647     uint64_t cluster_offset;
648     int ret;
649 
650     *pnum = nb_sectors;
651     /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
652      * can't pass them on today */
653     qemu_co_mutex_lock(&s->lock);
654     ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
655     qemu_co_mutex_unlock(&s->lock);
656     if (ret < 0) {
657         *pnum = 0;
658     }
659 
660     return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO);
661 }
662 
663 /* handle reading after the end of the backing file */
664 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
665                   int64_t sector_num, int nb_sectors)
666 {
667     int n1;
668     if ((sector_num + nb_sectors) <= bs->total_sectors)
669         return nb_sectors;
670     if (sector_num >= bs->total_sectors)
671         n1 = 0;
672     else
673         n1 = bs->total_sectors - sector_num;
674 
675     qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
676 
677     return n1;
678 }
679 
680 static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
681                           int remaining_sectors, QEMUIOVector *qiov)
682 {
683     BDRVQcowState *s = bs->opaque;
684     int index_in_cluster, n1;
685     int ret;
686     int cur_nr_sectors; /* number of sectors in current iteration */
687     uint64_t cluster_offset = 0;
688     uint64_t bytes_done = 0;
689     QEMUIOVector hd_qiov;
690     uint8_t *cluster_data = NULL;
691 
692     qemu_iovec_init(&hd_qiov, qiov->niov);
693 
694     qemu_co_mutex_lock(&s->lock);
695 
696     while (remaining_sectors != 0) {
697 
698         /* prepare next request */
699         cur_nr_sectors = remaining_sectors;
700         if (s->crypt_method) {
701             cur_nr_sectors = MIN(cur_nr_sectors,
702                 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
703         }
704 
705         ret = qcow2_get_cluster_offset(bs, sector_num << 9,
706             &cur_nr_sectors, &cluster_offset);
707         if (ret < 0) {
708             goto fail;
709         }
710 
711         index_in_cluster = sector_num & (s->cluster_sectors - 1);
712 
713         qemu_iovec_reset(&hd_qiov);
714         qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
715             cur_nr_sectors * 512);
716 
717         switch (ret) {
718         case QCOW2_CLUSTER_UNALLOCATED:
719 
720             if (bs->backing_hd) {
721                 /* read from the base image */
722                 n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
723                     sector_num, cur_nr_sectors);
724                 if (n1 > 0) {
725                     BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
726                     qemu_co_mutex_unlock(&s->lock);
727                     ret = bdrv_co_readv(bs->backing_hd, sector_num,
728                                         n1, &hd_qiov);
729                     qemu_co_mutex_lock(&s->lock);
730                     if (ret < 0) {
731                         goto fail;
732                     }
733                 }
734             } else {
735                 /* Note: in this case, no need to wait */
736                 qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
737             }
738             break;
739 
740         case QCOW2_CLUSTER_ZERO:
741             qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
742             break;
743 
744         case QCOW2_CLUSTER_COMPRESSED:
745             /* add AIO support for compressed blocks ? */
746             ret = qcow2_decompress_cluster(bs, cluster_offset);
747             if (ret < 0) {
748                 goto fail;
749             }
750 
751             qemu_iovec_from_buf(&hd_qiov, 0,
752                 s->cluster_cache + index_in_cluster * 512,
753                 512 * cur_nr_sectors);
754             break;
755 
756         case QCOW2_CLUSTER_NORMAL:
757             if ((cluster_offset & 511) != 0) {
758                 ret = -EIO;
759                 goto fail;
760             }
761 
762             if (s->crypt_method) {
763                 /*
764                  * For encrypted images, read everything into a temporary
765                  * contiguous buffer on which the AES functions can work.
766                  */
767                 if (!cluster_data) {
768                     cluster_data =
769                         qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
770                 }
771 
772                 assert(cur_nr_sectors <=
773                     QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
774                 qemu_iovec_reset(&hd_qiov);
775                 qemu_iovec_add(&hd_qiov, cluster_data,
776                     512 * cur_nr_sectors);
777             }
778 
779             BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
780             qemu_co_mutex_unlock(&s->lock);
781             ret = bdrv_co_readv(bs->file,
782                                 (cluster_offset >> 9) + index_in_cluster,
783                                 cur_nr_sectors, &hd_qiov);
784             qemu_co_mutex_lock(&s->lock);
785             if (ret < 0) {
786                 goto fail;
787             }
788             if (s->crypt_method) {
789                 qcow2_encrypt_sectors(s, sector_num,  cluster_data,
790                     cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
791                 qemu_iovec_from_buf(qiov, bytes_done,
792                     cluster_data, 512 * cur_nr_sectors);
793             }
794             break;
795 
796         default:
797             g_assert_not_reached();
798             ret = -EIO;
799             goto fail;
800         }
801 
802         remaining_sectors -= cur_nr_sectors;
803         sector_num += cur_nr_sectors;
804         bytes_done += cur_nr_sectors * 512;
805     }
806     ret = 0;
807 
808 fail:
809     qemu_co_mutex_unlock(&s->lock);
810 
811     qemu_iovec_destroy(&hd_qiov);
812     qemu_vfree(cluster_data);
813 
814     return ret;
815 }
816 
817 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
818                            int64_t sector_num,
819                            int remaining_sectors,
820                            QEMUIOVector *qiov)
821 {
822     BDRVQcowState *s = bs->opaque;
823     int index_in_cluster;
824     int n_end;
825     int ret;
826     int cur_nr_sectors; /* number of sectors in current iteration */
827     uint64_t cluster_offset;
828     QEMUIOVector hd_qiov;
829     uint64_t bytes_done = 0;
830     uint8_t *cluster_data = NULL;
831     QCowL2Meta *l2meta = NULL;
832 
833     trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
834                                  remaining_sectors);
835 
836     qemu_iovec_init(&hd_qiov, qiov->niov);
837 
838     s->cluster_cache_offset = -1; /* disable compressed cache */
839 
840     qemu_co_mutex_lock(&s->lock);
841 
842     while (remaining_sectors != 0) {
843 
844         l2meta = NULL;
845 
846         trace_qcow2_writev_start_part(qemu_coroutine_self());
847         index_in_cluster = sector_num & (s->cluster_sectors - 1);
848         n_end = index_in_cluster + remaining_sectors;
849         if (s->crypt_method &&
850             n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) {
851             n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
852         }
853 
854         ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
855             index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta);
856         if (ret < 0) {
857             goto fail;
858         }
859 
860         assert((cluster_offset & 511) == 0);
861 
862         qemu_iovec_reset(&hd_qiov);
863         qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
864             cur_nr_sectors * 512);
865 
866         if (s->crypt_method) {
867             if (!cluster_data) {
868                 cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS *
869                                                  s->cluster_size);
870             }
871 
872             assert(hd_qiov.size <=
873                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
874             qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
875 
876             qcow2_encrypt_sectors(s, sector_num, cluster_data,
877                 cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
878 
879             qemu_iovec_reset(&hd_qiov);
880             qemu_iovec_add(&hd_qiov, cluster_data,
881                 cur_nr_sectors * 512);
882         }
883 
884         qemu_co_mutex_unlock(&s->lock);
885         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
886         trace_qcow2_writev_data(qemu_coroutine_self(),
887                                 (cluster_offset >> 9) + index_in_cluster);
888         ret = bdrv_co_writev(bs->file,
889                              (cluster_offset >> 9) + index_in_cluster,
890                              cur_nr_sectors, &hd_qiov);
891         qemu_co_mutex_lock(&s->lock);
892         if (ret < 0) {
893             goto fail;
894         }
895 
896         while (l2meta != NULL) {
897             QCowL2Meta *next;
898 
899             ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
900             if (ret < 0) {
901                 goto fail;
902             }
903 
904             /* Take the request off the list of running requests */
905             if (l2meta->nb_clusters != 0) {
906                 QLIST_REMOVE(l2meta, next_in_flight);
907             }
908 
909             qemu_co_queue_restart_all(&l2meta->dependent_requests);
910 
911             next = l2meta->next;
912             g_free(l2meta);
913             l2meta = next;
914         }
915 
916         remaining_sectors -= cur_nr_sectors;
917         sector_num += cur_nr_sectors;
918         bytes_done += cur_nr_sectors * 512;
919         trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
920     }
921     ret = 0;
922 
923 fail:
924     qemu_co_mutex_unlock(&s->lock);
925 
926     while (l2meta != NULL) {
927         QCowL2Meta *next;
928 
929         if (l2meta->nb_clusters != 0) {
930             QLIST_REMOVE(l2meta, next_in_flight);
931         }
932         qemu_co_queue_restart_all(&l2meta->dependent_requests);
933 
934         next = l2meta->next;
935         g_free(l2meta);
936         l2meta = next;
937     }
938 
939     qemu_iovec_destroy(&hd_qiov);
940     qemu_vfree(cluster_data);
941     trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
942 
943     return ret;
944 }
945 
946 static void qcow2_close(BlockDriverState *bs)
947 {
948     BDRVQcowState *s = bs->opaque;
949     g_free(s->l1_table);
950 
951     qcow2_cache_flush(bs, s->l2_table_cache);
952     qcow2_cache_flush(bs, s->refcount_block_cache);
953 
954     qcow2_mark_clean(bs);
955 
956     qcow2_cache_destroy(bs, s->l2_table_cache);
957     qcow2_cache_destroy(bs, s->refcount_block_cache);
958 
959     g_free(s->unknown_header_fields);
960     cleanup_unknown_header_ext(bs);
961 
962     g_free(s->cluster_cache);
963     qemu_vfree(s->cluster_data);
964     qcow2_refcount_close(bs);
965     qcow2_free_snapshots(bs);
966 }
967 
968 static void qcow2_invalidate_cache(BlockDriverState *bs)
969 {
970     BDRVQcowState *s = bs->opaque;
971     int flags = s->flags;
972     AES_KEY aes_encrypt_key;
973     AES_KEY aes_decrypt_key;
974     uint32_t crypt_method = 0;
975     QDict *options;
976 
977     /*
978      * Backing files are read-only which makes all of their metadata immutable,
979      * that means we don't have to worry about reopening them here.
980      */
981 
982     if (s->crypt_method) {
983         crypt_method = s->crypt_method;
984         memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
985         memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
986     }
987 
988     qcow2_close(bs);
989 
990     options = qdict_new();
991     qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS,
992               qbool_from_int(s->use_lazy_refcounts));
993 
994     memset(s, 0, sizeof(BDRVQcowState));
995     qcow2_open(bs, options, flags);
996 
997     QDECREF(options);
998 
999     if (crypt_method) {
1000         s->crypt_method = crypt_method;
1001         memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
1002         memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
1003     }
1004 }
1005 
1006 static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
1007     size_t len, size_t buflen)
1008 {
1009     QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
1010     size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
1011 
1012     if (buflen < ext_len) {
1013         return -ENOSPC;
1014     }
1015 
1016     *ext_backing_fmt = (QCowExtension) {
1017         .magic  = cpu_to_be32(magic),
1018         .len    = cpu_to_be32(len),
1019     };
1020     memcpy(buf + sizeof(QCowExtension), s, len);
1021 
1022     return ext_len;
1023 }
1024 
1025 /*
1026  * Updates the qcow2 header, including the variable length parts of it, i.e.
1027  * the backing file name and all extensions. qcow2 was not designed to allow
1028  * such changes, so if we run out of space (we can only use the first cluster)
1029  * this function may fail.
1030  *
1031  * Returns 0 on success, -errno in error cases.
1032  */
1033 int qcow2_update_header(BlockDriverState *bs)
1034 {
1035     BDRVQcowState *s = bs->opaque;
1036     QCowHeader *header;
1037     char *buf;
1038     size_t buflen = s->cluster_size;
1039     int ret;
1040     uint64_t total_size;
1041     uint32_t refcount_table_clusters;
1042     size_t header_length;
1043     Qcow2UnknownHeaderExtension *uext;
1044 
1045     buf = qemu_blockalign(bs, buflen);
1046 
1047     /* Header structure */
1048     header = (QCowHeader*) buf;
1049 
1050     if (buflen < sizeof(*header)) {
1051         ret = -ENOSPC;
1052         goto fail;
1053     }
1054 
1055     header_length = sizeof(*header) + s->unknown_header_fields_size;
1056     total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
1057     refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
1058 
1059     *header = (QCowHeader) {
1060         /* Version 2 fields */
1061         .magic                  = cpu_to_be32(QCOW_MAGIC),
1062         .version                = cpu_to_be32(s->qcow_version),
1063         .backing_file_offset    = 0,
1064         .backing_file_size      = 0,
1065         .cluster_bits           = cpu_to_be32(s->cluster_bits),
1066         .size                   = cpu_to_be64(total_size),
1067         .crypt_method           = cpu_to_be32(s->crypt_method_header),
1068         .l1_size                = cpu_to_be32(s->l1_size),
1069         .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
1070         .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
1071         .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
1072         .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
1073         .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
1074 
1075         /* Version 3 fields */
1076         .incompatible_features  = cpu_to_be64(s->incompatible_features),
1077         .compatible_features    = cpu_to_be64(s->compatible_features),
1078         .autoclear_features     = cpu_to_be64(s->autoclear_features),
1079         .refcount_order         = cpu_to_be32(3 + REFCOUNT_SHIFT),
1080         .header_length          = cpu_to_be32(header_length),
1081     };
1082 
1083     /* For older versions, write a shorter header */
1084     switch (s->qcow_version) {
1085     case 2:
1086         ret = offsetof(QCowHeader, incompatible_features);
1087         break;
1088     case 3:
1089         ret = sizeof(*header);
1090         break;
1091     default:
1092         ret = -EINVAL;
1093         goto fail;
1094     }
1095 
1096     buf += ret;
1097     buflen -= ret;
1098     memset(buf, 0, buflen);
1099 
1100     /* Preserve any unknown field in the header */
1101     if (s->unknown_header_fields_size) {
1102         if (buflen < s->unknown_header_fields_size) {
1103             ret = -ENOSPC;
1104             goto fail;
1105         }
1106 
1107         memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
1108         buf += s->unknown_header_fields_size;
1109         buflen -= s->unknown_header_fields_size;
1110     }
1111 
1112     /* Backing file format header extension */
1113     if (*bs->backing_format) {
1114         ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
1115                              bs->backing_format, strlen(bs->backing_format),
1116                              buflen);
1117         if (ret < 0) {
1118             goto fail;
1119         }
1120 
1121         buf += ret;
1122         buflen -= ret;
1123     }
1124 
1125     /* Feature table */
1126     Qcow2Feature features[] = {
1127         {
1128             .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1129             .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
1130             .name = "dirty bit",
1131         },
1132         {
1133             .type = QCOW2_FEAT_TYPE_COMPATIBLE,
1134             .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
1135             .name = "lazy refcounts",
1136         },
1137     };
1138 
1139     ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
1140                          features, sizeof(features), buflen);
1141     if (ret < 0) {
1142         goto fail;
1143     }
1144     buf += ret;
1145     buflen -= ret;
1146 
1147     /* Keep unknown header extensions */
1148     QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
1149         ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
1150         if (ret < 0) {
1151             goto fail;
1152         }
1153 
1154         buf += ret;
1155         buflen -= ret;
1156     }
1157 
1158     /* End of header extensions */
1159     ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
1160     if (ret < 0) {
1161         goto fail;
1162     }
1163 
1164     buf += ret;
1165     buflen -= ret;
1166 
1167     /* Backing file name */
1168     if (*bs->backing_file) {
1169         size_t backing_file_len = strlen(bs->backing_file);
1170 
1171         if (buflen < backing_file_len) {
1172             ret = -ENOSPC;
1173             goto fail;
1174         }
1175 
1176         /* Using strncpy is ok here, since buf is not NUL-terminated. */
1177         strncpy(buf, bs->backing_file, buflen);
1178 
1179         header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
1180         header->backing_file_size   = cpu_to_be32(backing_file_len);
1181     }
1182 
1183     /* Write the new header */
1184     ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
1185     if (ret < 0) {
1186         goto fail;
1187     }
1188 
1189     ret = 0;
1190 fail:
1191     qemu_vfree(header);
1192     return ret;
1193 }
1194 
1195 static int qcow2_change_backing_file(BlockDriverState *bs,
1196     const char *backing_file, const char *backing_fmt)
1197 {
1198     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1199     pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1200 
1201     return qcow2_update_header(bs);
1202 }
1203 
1204 static int preallocate(BlockDriverState *bs)
1205 {
1206     uint64_t nb_sectors;
1207     uint64_t offset;
1208     uint64_t host_offset = 0;
1209     int num;
1210     int ret;
1211     QCowL2Meta *meta;
1212 
1213     nb_sectors = bdrv_getlength(bs) >> 9;
1214     offset = 0;
1215 
1216     while (nb_sectors) {
1217         num = MIN(nb_sectors, INT_MAX >> 9);
1218         ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
1219                                          &host_offset, &meta);
1220         if (ret < 0) {
1221             return ret;
1222         }
1223 
1224         ret = qcow2_alloc_cluster_link_l2(bs, meta);
1225         if (ret < 0) {
1226             qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters,
1227                                     QCOW2_DISCARD_NEVER);
1228             return ret;
1229         }
1230 
1231         /* There are no dependent requests, but we need to remove our request
1232          * from the list of in-flight requests */
1233         if (meta != NULL) {
1234             QLIST_REMOVE(meta, next_in_flight);
1235         }
1236 
1237         /* TODO Preallocate data if requested */
1238 
1239         nb_sectors -= num;
1240         offset += num << 9;
1241     }
1242 
1243     /*
1244      * It is expected that the image file is large enough to actually contain
1245      * all of the allocated clusters (otherwise we get failing reads after
1246      * EOF). Extend the image to the last allocated sector.
1247      */
1248     if (host_offset != 0) {
1249         uint8_t buf[512];
1250         memset(buf, 0, 512);
1251         ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1);
1252         if (ret < 0) {
1253             return ret;
1254         }
1255     }
1256 
1257     return 0;
1258 }
1259 
1260 static int qcow2_create2(const char *filename, int64_t total_size,
1261                          const char *backing_file, const char *backing_format,
1262                          int flags, size_t cluster_size, int prealloc,
1263                          QEMUOptionParameter *options, int version)
1264 {
1265     /* Calculate cluster_bits */
1266     int cluster_bits;
1267     cluster_bits = ffs(cluster_size) - 1;
1268     if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
1269         (1 << cluster_bits) != cluster_size)
1270     {
1271         error_report(
1272             "Cluster size must be a power of two between %d and %dk",
1273             1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
1274         return -EINVAL;
1275     }
1276 
1277     /*
1278      * Open the image file and write a minimal qcow2 header.
1279      *
1280      * We keep things simple and start with a zero-sized image. We also
1281      * do without refcount blocks or a L1 table for now. We'll fix the
1282      * inconsistency later.
1283      *
1284      * We do need a refcount table because growing the refcount table means
1285      * allocating two new refcount blocks - the seconds of which would be at
1286      * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
1287      * size for any qcow2 image.
1288      */
1289     BlockDriverState* bs;
1290     QCowHeader header;
1291     uint8_t* refcount_table;
1292     int ret;
1293 
1294     ret = bdrv_create_file(filename, options);
1295     if (ret < 0) {
1296         return ret;
1297     }
1298 
1299     ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
1300     if (ret < 0) {
1301         return ret;
1302     }
1303 
1304     /* Write the header */
1305     memset(&header, 0, sizeof(header));
1306     header.magic = cpu_to_be32(QCOW_MAGIC);
1307     header.version = cpu_to_be32(version);
1308     header.cluster_bits = cpu_to_be32(cluster_bits);
1309     header.size = cpu_to_be64(0);
1310     header.l1_table_offset = cpu_to_be64(0);
1311     header.l1_size = cpu_to_be32(0);
1312     header.refcount_table_offset = cpu_to_be64(cluster_size);
1313     header.refcount_table_clusters = cpu_to_be32(1);
1314     header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT);
1315     header.header_length = cpu_to_be32(sizeof(header));
1316 
1317     if (flags & BLOCK_FLAG_ENCRYPT) {
1318         header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1319     } else {
1320         header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1321     }
1322 
1323     if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
1324         header.compatible_features |=
1325             cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
1326     }
1327 
1328     ret = bdrv_pwrite(bs, 0, &header, sizeof(header));
1329     if (ret < 0) {
1330         goto out;
1331     }
1332 
1333     /* Write an empty refcount table */
1334     refcount_table = g_malloc0(cluster_size);
1335     ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size);
1336     g_free(refcount_table);
1337 
1338     if (ret < 0) {
1339         goto out;
1340     }
1341 
1342     bdrv_close(bs);
1343 
1344     /*
1345      * And now open the image and make it consistent first (i.e. increase the
1346      * refcount of the cluster that is occupied by the header and the refcount
1347      * table)
1348      */
1349     BlockDriver* drv = bdrv_find_format("qcow2");
1350     assert(drv != NULL);
1351     ret = bdrv_open(bs, filename, NULL,
1352         BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv);
1353     if (ret < 0) {
1354         goto out;
1355     }
1356 
1357     ret = qcow2_alloc_clusters(bs, 2 * cluster_size);
1358     if (ret < 0) {
1359         goto out;
1360 
1361     } else if (ret != 0) {
1362         error_report("Huh, first cluster in empty image is already in use?");
1363         abort();
1364     }
1365 
1366     /* Okay, now that we have a valid image, let's give it the right size */
1367     ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE);
1368     if (ret < 0) {
1369         goto out;
1370     }
1371 
1372     /* Want a backing file? There you go.*/
1373     if (backing_file) {
1374         ret = bdrv_change_backing_file(bs, backing_file, backing_format);
1375         if (ret < 0) {
1376             goto out;
1377         }
1378     }
1379 
1380     /* And if we're supposed to preallocate metadata, do that now */
1381     if (prealloc) {
1382         BDRVQcowState *s = bs->opaque;
1383         qemu_co_mutex_lock(&s->lock);
1384         ret = preallocate(bs);
1385         qemu_co_mutex_unlock(&s->lock);
1386         if (ret < 0) {
1387             goto out;
1388         }
1389     }
1390 
1391     ret = 0;
1392 out:
1393     bdrv_delete(bs);
1394     return ret;
1395 }
1396 
1397 static int qcow2_create(const char *filename, QEMUOptionParameter *options)
1398 {
1399     const char *backing_file = NULL;
1400     const char *backing_fmt = NULL;
1401     uint64_t sectors = 0;
1402     int flags = 0;
1403     size_t cluster_size = DEFAULT_CLUSTER_SIZE;
1404     int prealloc = 0;
1405     int version = 2;
1406 
1407     /* Read out options */
1408     while (options && options->name) {
1409         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1410             sectors = options->value.n / 512;
1411         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1412             backing_file = options->value.s;
1413         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
1414             backing_fmt = options->value.s;
1415         } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
1416             flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
1417         } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
1418             if (options->value.n) {
1419                 cluster_size = options->value.n;
1420             }
1421         } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1422             if (!options->value.s || !strcmp(options->value.s, "off")) {
1423                 prealloc = 0;
1424             } else if (!strcmp(options->value.s, "metadata")) {
1425                 prealloc = 1;
1426             } else {
1427                 fprintf(stderr, "Invalid preallocation mode: '%s'\n",
1428                     options->value.s);
1429                 return -EINVAL;
1430             }
1431         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) {
1432             if (!options->value.s || !strcmp(options->value.s, "0.10")) {
1433                 version = 2;
1434             } else if (!strcmp(options->value.s, "1.1")) {
1435                 version = 3;
1436             } else {
1437                 fprintf(stderr, "Invalid compatibility level: '%s'\n",
1438                     options->value.s);
1439                 return -EINVAL;
1440             }
1441         } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
1442             flags |= options->value.n ? BLOCK_FLAG_LAZY_REFCOUNTS : 0;
1443         }
1444         options++;
1445     }
1446 
1447     if (backing_file && prealloc) {
1448         fprintf(stderr, "Backing file and preallocation cannot be used at "
1449             "the same time\n");
1450         return -EINVAL;
1451     }
1452 
1453     if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
1454         fprintf(stderr, "Lazy refcounts only supported with compatibility "
1455                 "level 1.1 and above (use compat=1.1 or greater)\n");
1456         return -EINVAL;
1457     }
1458 
1459     return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags,
1460                          cluster_size, prealloc, options, version);
1461 }
1462 
1463 static int qcow2_make_empty(BlockDriverState *bs)
1464 {
1465 #if 0
1466     /* XXX: not correct */
1467     BDRVQcowState *s = bs->opaque;
1468     uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1469     int ret;
1470 
1471     memset(s->l1_table, 0, l1_length);
1472     if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0)
1473         return -1;
1474     ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
1475     if (ret < 0)
1476         return ret;
1477 
1478     l2_cache_reset(bs);
1479 #endif
1480     return 0;
1481 }
1482 
1483 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
1484     int64_t sector_num, int nb_sectors)
1485 {
1486     int ret;
1487     BDRVQcowState *s = bs->opaque;
1488 
1489     /* Emulate misaligned zero writes */
1490     if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
1491         return -ENOTSUP;
1492     }
1493 
1494     /* Whatever is left can use real zero clusters */
1495     qemu_co_mutex_lock(&s->lock);
1496     ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1497         nb_sectors);
1498     qemu_co_mutex_unlock(&s->lock);
1499 
1500     return ret;
1501 }
1502 
1503 static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
1504     int64_t sector_num, int nb_sectors)
1505 {
1506     int ret;
1507     BDRVQcowState *s = bs->opaque;
1508 
1509     qemu_co_mutex_lock(&s->lock);
1510     ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
1511         nb_sectors);
1512     qemu_co_mutex_unlock(&s->lock);
1513     return ret;
1514 }
1515 
1516 static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
1517 {
1518     BDRVQcowState *s = bs->opaque;
1519     int64_t new_l1_size;
1520     int ret;
1521 
1522     if (offset & 511) {
1523         error_report("The new size must be a multiple of 512");
1524         return -EINVAL;
1525     }
1526 
1527     /* cannot proceed if image has snapshots */
1528     if (s->nb_snapshots) {
1529         error_report("Can't resize an image which has snapshots");
1530         return -ENOTSUP;
1531     }
1532 
1533     /* shrinking is currently not supported */
1534     if (offset < bs->total_sectors * 512) {
1535         error_report("qcow2 doesn't support shrinking images yet");
1536         return -ENOTSUP;
1537     }
1538 
1539     new_l1_size = size_to_l1(s, offset);
1540     ret = qcow2_grow_l1_table(bs, new_l1_size, true);
1541     if (ret < 0) {
1542         return ret;
1543     }
1544 
1545     /* write updated header.size */
1546     offset = cpu_to_be64(offset);
1547     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
1548                            &offset, sizeof(uint64_t));
1549     if (ret < 0) {
1550         return ret;
1551     }
1552 
1553     s->l1_vm_state_index = new_l1_size;
1554     return 0;
1555 }
1556 
1557 /* XXX: put compressed sectors first, then all the cluster aligned
1558    tables to avoid losing bytes in alignment */
1559 static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
1560                                   const uint8_t *buf, int nb_sectors)
1561 {
1562     BDRVQcowState *s = bs->opaque;
1563     z_stream strm;
1564     int ret, out_len;
1565     uint8_t *out_buf;
1566     uint64_t cluster_offset;
1567 
1568     if (nb_sectors == 0) {
1569         /* align end of file to a sector boundary to ease reading with
1570            sector based I/Os */
1571         cluster_offset = bdrv_getlength(bs->file);
1572         cluster_offset = (cluster_offset + 511) & ~511;
1573         bdrv_truncate(bs->file, cluster_offset);
1574         return 0;
1575     }
1576 
1577     if (nb_sectors != s->cluster_sectors) {
1578         ret = -EINVAL;
1579 
1580         /* Zero-pad last write if image size is not cluster aligned */
1581         if (sector_num + nb_sectors == bs->total_sectors &&
1582             nb_sectors < s->cluster_sectors) {
1583             uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
1584             memset(pad_buf, 0, s->cluster_size);
1585             memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
1586             ret = qcow2_write_compressed(bs, sector_num,
1587                                          pad_buf, s->cluster_sectors);
1588             qemu_vfree(pad_buf);
1589         }
1590         return ret;
1591     }
1592 
1593     out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
1594 
1595     /* best compression, small window, no zlib header */
1596     memset(&strm, 0, sizeof(strm));
1597     ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
1598                        Z_DEFLATED, -12,
1599                        9, Z_DEFAULT_STRATEGY);
1600     if (ret != 0) {
1601         ret = -EINVAL;
1602         goto fail;
1603     }
1604 
1605     strm.avail_in = s->cluster_size;
1606     strm.next_in = (uint8_t *)buf;
1607     strm.avail_out = s->cluster_size;
1608     strm.next_out = out_buf;
1609 
1610     ret = deflate(&strm, Z_FINISH);
1611     if (ret != Z_STREAM_END && ret != Z_OK) {
1612         deflateEnd(&strm);
1613         ret = -EINVAL;
1614         goto fail;
1615     }
1616     out_len = strm.next_out - out_buf;
1617 
1618     deflateEnd(&strm);
1619 
1620     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
1621         /* could not compress: write normal cluster */
1622         ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
1623         if (ret < 0) {
1624             goto fail;
1625         }
1626     } else {
1627         cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
1628             sector_num << 9, out_len);
1629         if (!cluster_offset) {
1630             ret = -EIO;
1631             goto fail;
1632         }
1633         cluster_offset &= s->cluster_offset_mask;
1634         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
1635         ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
1636         if (ret < 0) {
1637             goto fail;
1638         }
1639     }
1640 
1641     ret = 0;
1642 fail:
1643     g_free(out_buf);
1644     return ret;
1645 }
1646 
1647 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
1648 {
1649     BDRVQcowState *s = bs->opaque;
1650     int ret;
1651 
1652     qemu_co_mutex_lock(&s->lock);
1653     ret = qcow2_cache_flush(bs, s->l2_table_cache);
1654     if (ret < 0) {
1655         qemu_co_mutex_unlock(&s->lock);
1656         return ret;
1657     }
1658 
1659     if (qcow2_need_accurate_refcounts(s)) {
1660         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
1661         if (ret < 0) {
1662             qemu_co_mutex_unlock(&s->lock);
1663             return ret;
1664         }
1665     }
1666     qemu_co_mutex_unlock(&s->lock);
1667 
1668     return 0;
1669 }
1670 
1671 static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
1672 {
1673 	return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
1674 }
1675 
1676 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1677 {
1678     BDRVQcowState *s = bs->opaque;
1679     bdi->cluster_size = s->cluster_size;
1680     bdi->vm_state_offset = qcow2_vm_state_offset(s);
1681     return 0;
1682 }
1683 
1684 #if 0
1685 static void dump_refcounts(BlockDriverState *bs)
1686 {
1687     BDRVQcowState *s = bs->opaque;
1688     int64_t nb_clusters, k, k1, size;
1689     int refcount;
1690 
1691     size = bdrv_getlength(bs->file);
1692     nb_clusters = size_to_clusters(s, size);
1693     for(k = 0; k < nb_clusters;) {
1694         k1 = k;
1695         refcount = get_refcount(bs, k);
1696         k++;
1697         while (k < nb_clusters && get_refcount(bs, k) == refcount)
1698             k++;
1699         printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
1700                k - k1);
1701     }
1702 }
1703 #endif
1704 
1705 static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
1706                               int64_t pos)
1707 {
1708     BDRVQcowState *s = bs->opaque;
1709     int growable = bs->growable;
1710     int ret;
1711 
1712     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
1713     bs->growable = 1;
1714     ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
1715     bs->growable = growable;
1716 
1717     return ret;
1718 }
1719 
1720 static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1721                               int64_t pos, int size)
1722 {
1723     BDRVQcowState *s = bs->opaque;
1724     int growable = bs->growable;
1725     int ret;
1726 
1727     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
1728     bs->growable = 1;
1729     ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
1730     bs->growable = growable;
1731 
1732     return ret;
1733 }
1734 
1735 static QEMUOptionParameter qcow2_create_options[] = {
1736     {
1737         .name = BLOCK_OPT_SIZE,
1738         .type = OPT_SIZE,
1739         .help = "Virtual disk size"
1740     },
1741     {
1742         .name = BLOCK_OPT_COMPAT_LEVEL,
1743         .type = OPT_STRING,
1744         .help = "Compatibility level (0.10 or 1.1)"
1745     },
1746     {
1747         .name = BLOCK_OPT_BACKING_FILE,
1748         .type = OPT_STRING,
1749         .help = "File name of a base image"
1750     },
1751     {
1752         .name = BLOCK_OPT_BACKING_FMT,
1753         .type = OPT_STRING,
1754         .help = "Image format of the base image"
1755     },
1756     {
1757         .name = BLOCK_OPT_ENCRYPT,
1758         .type = OPT_FLAG,
1759         .help = "Encrypt the image"
1760     },
1761     {
1762         .name = BLOCK_OPT_CLUSTER_SIZE,
1763         .type = OPT_SIZE,
1764         .help = "qcow2 cluster size",
1765         .value = { .n = DEFAULT_CLUSTER_SIZE },
1766     },
1767     {
1768         .name = BLOCK_OPT_PREALLOC,
1769         .type = OPT_STRING,
1770         .help = "Preallocation mode (allowed values: off, metadata)"
1771     },
1772     {
1773         .name = BLOCK_OPT_LAZY_REFCOUNTS,
1774         .type = OPT_FLAG,
1775         .help = "Postpone refcount updates",
1776     },
1777     { NULL }
1778 };
1779 
1780 static BlockDriver bdrv_qcow2 = {
1781     .format_name        = "qcow2",
1782     .instance_size      = sizeof(BDRVQcowState),
1783     .bdrv_probe         = qcow2_probe,
1784     .bdrv_open          = qcow2_open,
1785     .bdrv_close         = qcow2_close,
1786     .bdrv_reopen_prepare  = qcow2_reopen_prepare,
1787     .bdrv_create        = qcow2_create,
1788     .bdrv_has_zero_init = bdrv_has_zero_init_1,
1789     .bdrv_co_is_allocated = qcow2_co_is_allocated,
1790     .bdrv_set_key       = qcow2_set_key,
1791     .bdrv_make_empty    = qcow2_make_empty,
1792 
1793     .bdrv_co_readv          = qcow2_co_readv,
1794     .bdrv_co_writev         = qcow2_co_writev,
1795     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
1796 
1797     .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
1798     .bdrv_co_discard        = qcow2_co_discard,
1799     .bdrv_truncate          = qcow2_truncate,
1800     .bdrv_write_compressed  = qcow2_write_compressed,
1801 
1802     .bdrv_snapshot_create   = qcow2_snapshot_create,
1803     .bdrv_snapshot_goto     = qcow2_snapshot_goto,
1804     .bdrv_snapshot_delete   = qcow2_snapshot_delete,
1805     .bdrv_snapshot_list     = qcow2_snapshot_list,
1806     .bdrv_snapshot_load_tmp     = qcow2_snapshot_load_tmp,
1807     .bdrv_get_info      = qcow2_get_info,
1808 
1809     .bdrv_save_vmstate    = qcow2_save_vmstate,
1810     .bdrv_load_vmstate    = qcow2_load_vmstate,
1811 
1812     .bdrv_change_backing_file   = qcow2_change_backing_file,
1813 
1814     .bdrv_invalidate_cache      = qcow2_invalidate_cache,
1815 
1816     .create_options = qcow2_create_options,
1817     .bdrv_check = qcow2_check,
1818 };
1819 
1820 static void bdrv_qcow2_init(void)
1821 {
1822     bdrv_register(&bdrv_qcow2);
1823 }
1824 
1825 block_init(bdrv_qcow2_init);
1826