xref: /openbmc/qemu/block/qcow2.c (revision 922a01a013d2270682a188258cbccacfecf8129c)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "block/block_int.h"
27 #include "sysemu/block-backend.h"
28 #include "qemu/module.h"
29 #include <zlib.h>
30 #include "block/qcow2.h"
31 #include "qemu/error-report.h"
32 #include "qapi/error.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qapi/qmp/qdict.h"
35 #include "qapi/qmp/qstring.h"
36 #include "qapi-event.h"
37 #include "trace.h"
38 #include "qemu/option_int.h"
39 #include "qemu/cutils.h"
40 #include "qemu/bswap.h"
41 #include "qapi/opts-visitor.h"
42 #include "qapi-visit.h"
43 #include "block/crypto.h"
44 
45 /*
46   Differences with QCOW:
47 
48   - Support for multiple incremental snapshots.
49   - Memory management by reference counts.
50   - Clusters which have a reference count of one have the bit
51     QCOW_OFLAG_COPIED to optimize write performance.
52   - Size of compressed clusters is stored in sectors to reduce bit usage
53     in the cluster offsets.
54   - Support for storing additional data (such as the VM state) in the
55     snapshots.
56   - If a backing store is used, the cluster size is not constrained
57     (could be backported to QCOW).
58   - L2 tables have always a size of one cluster.
59 */
60 
61 
/*
 * Fixed-size header that precedes every qcow2 header extension: a magic
 * value identifying the extension type followed by the length in bytes of
 * the extension data.  qcow2_read_extensions() converts both fields from
 * big-endian after reading them.
 */
typedef struct {
    uint32_t magic;
    uint32_t len;
} QEMU_PACKED QCowExtension;

/* Magic values of the header extensions handled by qcow2_read_extensions() */
#define  QCOW2_EXT_MAGIC_END 0
#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
#define  QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77
#define  QCOW2_EXT_MAGIC_BITMAPS 0x23852875
72 
73 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
74 {
75     const QCowHeader *cow_header = (const void *)buf;
76 
77     if (buf_size >= sizeof(QCowHeader) &&
78         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
79         be32_to_cpu(cow_header->version) >= 2)
80         return 100;
81     else
82         return 0;
83 }
84 
85 
86 static ssize_t qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
87                                           uint8_t *buf, size_t buflen,
88                                           void *opaque, Error **errp)
89 {
90     BlockDriverState *bs = opaque;
91     BDRVQcow2State *s = bs->opaque;
92     ssize_t ret;
93 
94     if ((offset + buflen) > s->crypto_header.length) {
95         error_setg(errp, "Request for data outside of extension header");
96         return -1;
97     }
98 
99     ret = bdrv_pread(bs->file,
100                      s->crypto_header.offset + offset, buf, buflen);
101     if (ret < 0) {
102         error_setg_errno(errp, -ret, "Could not read encryption header");
103         return -1;
104     }
105     return ret;
106 }
107 
108 
109 static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
110                                           void *opaque, Error **errp)
111 {
112     BlockDriverState *bs = opaque;
113     BDRVQcow2State *s = bs->opaque;
114     int64_t ret;
115     int64_t clusterlen;
116 
117     ret = qcow2_alloc_clusters(bs, headerlen);
118     if (ret < 0) {
119         error_setg_errno(errp, -ret,
120                          "Cannot allocate cluster for LUKS header size %zu",
121                          headerlen);
122         return -1;
123     }
124 
125     s->crypto_header.length = headerlen;
126     s->crypto_header.offset = ret;
127 
128     /* Zero fill remaining space in cluster so it has predictable
129      * content in case of future spec changes */
130     clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;
131     assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen) == 0);
132     ret = bdrv_pwrite_zeroes(bs->file,
133                              ret + headerlen,
134                              clusterlen - headerlen, 0);
135     if (ret < 0) {
136         error_setg_errno(errp, -ret, "Could not zero fill encryption header");
137         return -1;
138     }
139 
140     return ret;
141 }
142 
143 
144 static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
145                                            const uint8_t *buf, size_t buflen,
146                                            void *opaque, Error **errp)
147 {
148     BlockDriverState *bs = opaque;
149     BDRVQcow2State *s = bs->opaque;
150     ssize_t ret;
151 
152     if ((offset + buflen) > s->crypto_header.length) {
153         error_setg(errp, "Request for data outside of extension header");
154         return -1;
155     }
156 
157     ret = bdrv_pwrite(bs->file,
158                       s->crypto_header.offset + offset, buf, buflen);
159     if (ret < 0) {
160         error_setg_errno(errp, -ret, "Could not read encryption header");
161         return -1;
162     }
163     return ret;
164 }
165 
166 
167 /*
168  * read qcow2 extension and fill bs
169  * start reading from start_offset
170  * finish reading upon magic of value 0 or when end_offset reached
171  * unknown magic is skipped (future extension this version knows nothing about)
172  * return 0 upon success, non-0 otherwise
173  */
174 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
175                                  uint64_t end_offset, void **p_feature_table,
176                                  int flags, bool *need_update_header,
177                                  Error **errp)
178 {
179     BDRVQcow2State *s = bs->opaque;
180     QCowExtension ext;
181     uint64_t offset;
182     int ret;
183     Qcow2BitmapHeaderExt bitmaps_ext;
184 
185     if (need_update_header != NULL) {
186         *need_update_header = false;
187     }
188 
189 #ifdef DEBUG_EXT
190     printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
191 #endif
192     offset = start_offset;
193     while (offset < end_offset) {
194 
195 #ifdef DEBUG_EXT
196         /* Sanity check */
197         if (offset > s->cluster_size)
198             printf("qcow2_read_extension: suspicious offset %lu\n", offset);
199 
200         printf("attempting to read extended header in offset %lu\n", offset);
201 #endif
202 
203         ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
204         if (ret < 0) {
205             error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
206                              "pread fail from offset %" PRIu64, offset);
207             return 1;
208         }
209         be32_to_cpus(&ext.magic);
210         be32_to_cpus(&ext.len);
211         offset += sizeof(ext);
212 #ifdef DEBUG_EXT
213         printf("ext.magic = 0x%x\n", ext.magic);
214 #endif
215         if (offset > end_offset || ext.len > end_offset - offset) {
216             error_setg(errp, "Header extension too large");
217             return -EINVAL;
218         }
219 
220         switch (ext.magic) {
221         case QCOW2_EXT_MAGIC_END:
222             return 0;
223 
224         case QCOW2_EXT_MAGIC_BACKING_FORMAT:
225             if (ext.len >= sizeof(bs->backing_format)) {
226                 error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
227                            " too large (>=%zu)", ext.len,
228                            sizeof(bs->backing_format));
229                 return 2;
230             }
231             ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
232             if (ret < 0) {
233                 error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
234                                  "Could not read format name");
235                 return 3;
236             }
237             bs->backing_format[ext.len] = '\0';
238             s->image_backing_format = g_strdup(bs->backing_format);
239 #ifdef DEBUG_EXT
240             printf("Qcow2: Got format extension %s\n", bs->backing_format);
241 #endif
242             break;
243 
244         case QCOW2_EXT_MAGIC_FEATURE_TABLE:
245             if (p_feature_table != NULL) {
246                 void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
247                 ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
248                 if (ret < 0) {
249                     error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
250                                      "Could not read table");
251                     return ret;
252                 }
253 
254                 *p_feature_table = feature_table;
255             }
256             break;
257 
258         case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
259             unsigned int cflags = 0;
260             if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
261                 error_setg(errp, "CRYPTO header extension only "
262                            "expected with LUKS encryption method");
263                 return -EINVAL;
264             }
265             if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
266                 error_setg(errp, "CRYPTO header extension size %u, "
267                            "but expected size %zu", ext.len,
268                            sizeof(Qcow2CryptoHeaderExtension));
269                 return -EINVAL;
270             }
271 
272             ret = bdrv_pread(bs->file, offset, &s->crypto_header, ext.len);
273             if (ret < 0) {
274                 error_setg_errno(errp, -ret,
275                                  "Unable to read CRYPTO header extension");
276                 return ret;
277             }
278             be64_to_cpus(&s->crypto_header.offset);
279             be64_to_cpus(&s->crypto_header.length);
280 
281             if ((s->crypto_header.offset % s->cluster_size) != 0) {
282                 error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
283                            "not a multiple of cluster size '%u'",
284                            s->crypto_header.offset, s->cluster_size);
285                 return -EINVAL;
286             }
287 
288             if (flags & BDRV_O_NO_IO) {
289                 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
290             }
291             s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
292                                            qcow2_crypto_hdr_read_func,
293                                            bs, cflags, errp);
294             if (!s->crypto) {
295                 return -EINVAL;
296             }
297         }   break;
298 
299         case QCOW2_EXT_MAGIC_BITMAPS:
300             if (ext.len != sizeof(bitmaps_ext)) {
301                 error_setg_errno(errp, -ret, "bitmaps_ext: "
302                                  "Invalid extension length");
303                 return -EINVAL;
304             }
305 
306             if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
307                 if (s->qcow_version < 3) {
308                     /* Let's be a bit more specific */
309                     warn_report("This qcow2 v2 image contains bitmaps, but "
310                                 "they may have been modified by a program "
311                                 "without persistent bitmap support; so now "
312                                 "they must all be considered inconsistent");
313                 } else {
314                     warn_report("a program lacking bitmap support "
315                                 "modified this file, so all bitmaps are now "
316                                 "considered inconsistent");
317                 }
318                 error_printf("Some clusters may be leaked, "
319                              "run 'qemu-img check -r' on the image "
320                              "file to fix.");
321                 if (need_update_header != NULL) {
322                     /* Updating is needed to drop invalid bitmap extension. */
323                     *need_update_header = true;
324                 }
325                 break;
326             }
327 
328             ret = bdrv_pread(bs->file, offset, &bitmaps_ext, ext.len);
329             if (ret < 0) {
330                 error_setg_errno(errp, -ret, "bitmaps_ext: "
331                                  "Could not read ext header");
332                 return ret;
333             }
334 
335             if (bitmaps_ext.reserved32 != 0) {
336                 error_setg_errno(errp, -ret, "bitmaps_ext: "
337                                  "Reserved field is not zero");
338                 return -EINVAL;
339             }
340 
341             be32_to_cpus(&bitmaps_ext.nb_bitmaps);
342             be64_to_cpus(&bitmaps_ext.bitmap_directory_size);
343             be64_to_cpus(&bitmaps_ext.bitmap_directory_offset);
344 
345             if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
346                 error_setg(errp,
347                            "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
348                            "exceeding the QEMU supported maximum of %d",
349                            bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
350                 return -EINVAL;
351             }
352 
353             if (bitmaps_ext.nb_bitmaps == 0) {
354                 error_setg(errp, "found bitmaps extension with zero bitmaps");
355                 return -EINVAL;
356             }
357 
358             if (bitmaps_ext.bitmap_directory_offset & (s->cluster_size - 1)) {
359                 error_setg(errp, "bitmaps_ext: "
360                                  "invalid bitmap directory offset");
361                 return -EINVAL;
362             }
363 
364             if (bitmaps_ext.bitmap_directory_size >
365                 QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
366                 error_setg(errp, "bitmaps_ext: "
367                                  "bitmap directory size (%" PRIu64 ") exceeds "
368                                  "the maximum supported size (%d)",
369                                  bitmaps_ext.bitmap_directory_size,
370                                  QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
371                 return -EINVAL;
372             }
373 
374             s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
375             s->bitmap_directory_offset =
376                     bitmaps_ext.bitmap_directory_offset;
377             s->bitmap_directory_size =
378                     bitmaps_ext.bitmap_directory_size;
379 
380 #ifdef DEBUG_EXT
381             printf("Qcow2: Got bitmaps extension: "
382                    "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
383                    s->bitmap_directory_offset, s->nb_bitmaps);
384 #endif
385             break;
386 
387         default:
388             /* unknown magic - save it in case we need to rewrite the header */
389             /* If you add a new feature, make sure to also update the fast
390              * path of qcow2_make_empty() to deal with it. */
391             {
392                 Qcow2UnknownHeaderExtension *uext;
393 
394                 uext = g_malloc0(sizeof(*uext)  + ext.len);
395                 uext->magic = ext.magic;
396                 uext->len = ext.len;
397                 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
398 
399                 ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
400                 if (ret < 0) {
401                     error_setg_errno(errp, -ret, "ERROR: unknown extension: "
402                                      "Could not read data");
403                     return ret;
404                 }
405             }
406             break;
407         }
408 
409         offset += ((ext.len + 7) & ~7);
410     }
411 
412     return 0;
413 }
414 
415 static void cleanup_unknown_header_ext(BlockDriverState *bs)
416 {
417     BDRVQcow2State *s = bs->opaque;
418     Qcow2UnknownHeaderExtension *uext, *next;
419 
420     QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
421         QLIST_REMOVE(uext, next);
422         g_free(uext);
423     }
424 }
425 
426 static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
427                                        uint64_t mask)
428 {
429     char *features = g_strdup("");
430     char *old;
431 
432     while (table && table->name[0] != '\0') {
433         if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
434             if (mask & (1ULL << table->bit)) {
435                 old = features;
436                 features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "",
437                                            table->name);
438                 g_free(old);
439                 mask &= ~(1ULL << table->bit);
440             }
441         }
442         table++;
443     }
444 
445     if (mask) {
446         old = features;
447         features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64,
448                                    old, *old ? ", " : "", mask);
449         g_free(old);
450     }
451 
452     error_setg(errp, "Unsupported qcow2 feature(s): %s", features);
453     g_free(features);
454 }
455 
456 /*
457  * Sets the dirty bit and flushes afterwards if necessary.
458  *
459  * The incompatible_features bit is only set if the image file header was
460  * updated successfully.  Therefore it is not required to check the return
461  * value of this function.
462  */
463 int qcow2_mark_dirty(BlockDriverState *bs)
464 {
465     BDRVQcow2State *s = bs->opaque;
466     uint64_t val;
467     int ret;
468 
469     assert(s->qcow_version >= 3);
470 
471     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
472         return 0; /* already dirty */
473     }
474 
475     val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
476     ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
477                       &val, sizeof(val));
478     if (ret < 0) {
479         return ret;
480     }
481     ret = bdrv_flush(bs->file->bs);
482     if (ret < 0) {
483         return ret;
484     }
485 
486     /* Only treat image as dirty if the header was updated successfully */
487     s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
488     return 0;
489 }
490 
491 /*
492  * Clears the dirty bit and flushes before if necessary.  Only call this
493  * function when there are no pending requests, it does not guard against
494  * concurrent requests dirtying the image.
495  */
496 static int qcow2_mark_clean(BlockDriverState *bs)
497 {
498     BDRVQcow2State *s = bs->opaque;
499 
500     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
501         int ret;
502 
503         s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
504 
505         ret = bdrv_flush(bs);
506         if (ret < 0) {
507             return ret;
508         }
509 
510         return qcow2_update_header(bs);
511     }
512     return 0;
513 }
514 
515 /*
516  * Marks the image as corrupt.
517  */
518 int qcow2_mark_corrupt(BlockDriverState *bs)
519 {
520     BDRVQcow2State *s = bs->opaque;
521 
522     s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
523     return qcow2_update_header(bs);
524 }
525 
526 /*
527  * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
528  * before if necessary.
529  */
530 int qcow2_mark_consistent(BlockDriverState *bs)
531 {
532     BDRVQcow2State *s = bs->opaque;
533 
534     if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
535         int ret = bdrv_flush(bs);
536         if (ret < 0) {
537             return ret;
538         }
539 
540         s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
541         return qcow2_update_header(bs);
542     }
543     return 0;
544 }
545 
546 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
547                        BdrvCheckMode fix)
548 {
549     int ret = qcow2_check_refcounts(bs, result, fix);
550     if (ret < 0) {
551         return ret;
552     }
553 
554     if (fix && result->check_errors == 0 && result->corruptions == 0) {
555         ret = qcow2_mark_clean(bs);
556         if (ret < 0) {
557             return ret;
558         }
559         return qcow2_mark_consistent(bs);
560     }
561     return ret;
562 }
563 
564 static int validate_table_offset(BlockDriverState *bs, uint64_t offset,
565                                  uint64_t entries, size_t entry_len)
566 {
567     BDRVQcow2State *s = bs->opaque;
568     uint64_t size;
569 
570     /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
571      * because values will be passed to qemu functions taking int64_t. */
572     if (entries > INT64_MAX / entry_len) {
573         return -EINVAL;
574     }
575 
576     size = entries * entry_len;
577 
578     if (INT64_MAX - size < offset) {
579         return -EINVAL;
580     }
581 
582     /* Tables must be cluster aligned */
583     if (offset_into_cluster(s, offset) != 0) {
584         return -EINVAL;
585     }
586 
587     return 0;
588 }
589 
/*
 * Runtime options of the qcow2 driver.  qcow2_update_options_prepare()
 * creates a QemuOpts from this list and absorbs the options QDict into it.
 */
static QemuOptsList qcow2_runtime_opts = {
    .name = "qcow2",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
    .desc = {
        {
            .name = QCOW2_OPT_LAZY_REFCOUNTS,
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
        },
        {
            .name = QCOW2_OPT_DISCARD_REQUEST,
            .type = QEMU_OPT_BOOL,
            .help = "Pass guest discard requests to the layer below",
        },
        {
            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when snapshot related space "
                    "is freed",
        },
        {
            .name = QCOW2_OPT_DISCARD_OTHER,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when other clusters are freed",
        },
        {
            .name = QCOW2_OPT_OVERLAP,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
        {
            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
        /* One boolean per individual overlap check */
        {
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the main qcow2 header",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the active L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an active L2 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the refcount table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into a refcount block",
        },
        {
            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the snapshot table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L2 table",
        },
        /* Metadata cache sizing; interpreted by read_cache_sizes() */
        {
            .name = QCOW2_OPT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
                    "cache size",
        },
        {
            .name = QCOW2_OPT_L2_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum L2 table cache size",
        },
        {
            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum refcount block cache size",
        },
        {
            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
            .type = QEMU_OPT_NUMBER,
            .help = "Clean unused cache entries after this time (in seconds)",
        },
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow2 AES key or LUKS passphrase"),
        { /* end of list */ }
    },
};
693 
/*
 * Maps each overlap check bit number (QCOW2_OL_*_BITNR) to the name of the
 * boolean runtime option declared for that check in qcow2_runtime_opts.
 */
static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    [QCOW2_OL_ACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    [QCOW2_OL_ACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    [QCOW2_OL_INACTIVE_L1_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
};
704 
705 static void cache_clean_timer_cb(void *opaque)
706 {
707     BlockDriverState *bs = opaque;
708     BDRVQcow2State *s = bs->opaque;
709     qcow2_cache_clean_unused(bs, s->l2_table_cache);
710     qcow2_cache_clean_unused(bs, s->refcount_block_cache);
711     timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
712               (int64_t) s->cache_clean_interval * 1000);
713 }
714 
715 static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
716 {
717     BDRVQcow2State *s = bs->opaque;
718     if (s->cache_clean_interval > 0) {
719         s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL,
720                                              SCALE_MS, cache_clean_timer_cb,
721                                              bs);
722         timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
723                   (int64_t) s->cache_clean_interval * 1000);
724     }
725 }
726 
727 static void cache_clean_timer_del(BlockDriverState *bs)
728 {
729     BDRVQcow2State *s = bs->opaque;
730     if (s->cache_clean_timer) {
731         timer_del(s->cache_clean_timer);
732         timer_free(s->cache_clean_timer);
733         s->cache_clean_timer = NULL;
734     }
735 }
736 
/*
 * AioContext detach hook: tears down the cache cleaning timer via
 * cache_clean_timer_del().
 */
static void qcow2_detach_aio_context(BlockDriverState *bs)
{
    cache_clean_timer_del(bs);
}
741 
/*
 * AioContext attach hook: sets up the cache cleaning timer in @new_context
 * via cache_clean_timer_init().
 */
static void qcow2_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    cache_clean_timer_init(bs, new_context);
}
747 
748 static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
749                              uint64_t *l2_cache_size,
750                              uint64_t *refcount_cache_size, Error **errp)
751 {
752     BDRVQcow2State *s = bs->opaque;
753     uint64_t combined_cache_size;
754     bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
755 
756     combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
757     l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
758     refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
759 
760     combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
761     *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0);
762     *refcount_cache_size = qemu_opt_get_size(opts,
763                                              QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
764 
765     if (combined_cache_size_set) {
766         if (l2_cache_size_set && refcount_cache_size_set) {
767             error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
768                        " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
769                        "the same time");
770             return;
771         } else if (*l2_cache_size > combined_cache_size) {
772             error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
773                        QCOW2_OPT_CACHE_SIZE);
774             return;
775         } else if (*refcount_cache_size > combined_cache_size) {
776             error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
777                        QCOW2_OPT_CACHE_SIZE);
778             return;
779         }
780 
781         if (l2_cache_size_set) {
782             *refcount_cache_size = combined_cache_size - *l2_cache_size;
783         } else if (refcount_cache_size_set) {
784             *l2_cache_size = combined_cache_size - *refcount_cache_size;
785         } else {
786             *refcount_cache_size = combined_cache_size
787                                  / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
788             *l2_cache_size = combined_cache_size - *refcount_cache_size;
789         }
790     } else {
791         if (!l2_cache_size_set && !refcount_cache_size_set) {
792             *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
793                                  (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
794                                  * s->cluster_size);
795             *refcount_cache_size = *l2_cache_size
796                                  / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
797         } else if (!l2_cache_size_set) {
798             *l2_cache_size = *refcount_cache_size
799                            * DEFAULT_L2_REFCOUNT_SIZE_RATIO;
800         } else if (!refcount_cache_size_set) {
801             *refcount_cache_size = *l2_cache_size
802                                  / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
803         }
804     }
805 }
806 
/*
 * Set of option-derived values handed to qcow2_update_options_prepare().
 * NOTE(review): the fields appear to mirror the like-named members of
 * BDRVQcow2State so that new values can be staged before they are applied;
 * confirm against the users of this struct.
 */
typedef struct Qcow2ReopenState {
    Qcow2Cache *l2_table_cache;
    Qcow2Cache *refcount_block_cache;
    bool use_lazy_refcounts;
    int overlap_check;
    bool discard_passthrough[QCOW2_DISCARD_MAX];
    uint64_t cache_clean_interval;
    QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
} Qcow2ReopenState;
816 
/*
 * Parse the qcow2 runtime options in @options and stage the resulting
 * settings in @r without installing them into the BDRVQcow2State yet.
 *
 * Recognised options are absorbed from @options; all "encrypt."-prefixed
 * entries are extracted from it as well.
 *
 * Note that this step is not free of side effects on the image: the
 * currently active metadata caches are flushed, and qcow2_mark_clean() is
 * called when lazy refcounts are being switched off.
 *
 * Returns 0 on success. On failure a negative errno value is returned and
 * the error paths here set or propagate an error into @errp. @r may then be
 * partially filled in; the callers in this file pass it to
 * qcow2_update_options_abort() in that case.
 */
static int qcow2_update_options_prepare(BlockDriverState *bs,
                                        Qcow2ReopenState *r,
                                        QDict *options, int flags,
                                        Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QemuOpts *opts = NULL;
    const char *opt_overlap_check, *opt_overlap_check_template;
    int overlap_check_template = 0;
    uint64_t l2_cache_size, refcount_cache_size;
    int i;
    const char *encryptfmt;
    QDict *encryptopts = NULL;
    Error *local_err = NULL;
    int ret;

    /* Split the "encrypt.*" options off before the generic ones are parsed */
    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
    encryptfmt = qdict_get_try_str(encryptopts, "format");

    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* get L2 table/refcount block cache size from command line options */
    read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size,
                     &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /*
     * Convert the byte sizes into a number of clusters, which is the unit
     * handed to qcow2_cache_create() below, and clamp them to the supported
     * range.
     */
    l2_cache_size /= s->cluster_size;
    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
        l2_cache_size = MIN_L2_CACHE_SIZE;
    }
    if (l2_cache_size > INT_MAX) {
        error_setg(errp, "L2 cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    refcount_cache_size /= s->cluster_size;
    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
    }
    if (refcount_cache_size > INT_MAX) {
        error_setg(errp, "Refcount cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    /* alloc new L2 table/refcount block cache, flush old one */
    if (s->l2_table_cache) {
        ret = qcow2_cache_flush(bs, s->l2_table_cache);
        if (ret) {
            error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
            goto fail;
        }
    }

    if (s->refcount_block_cache) {
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the refcount block cache");
            goto fail;
        }
    }

    /*
     * The new caches are only staged in @r here; the old ones in @s stay in
     * place until qcow2_update_options_commit() replaces them.
     */
    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size);
    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size);
    if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
        error_setg(errp, "Could not allocate metadata caches");
        ret = -ENOMEM;
        goto fail;
    }

    /* New interval for cache cleanup timer */
    r->cache_clean_interval =
        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
                            s->cache_clean_interval);
#ifndef CONFIG_LINUX
    if (r->cache_clean_interval != 0) {
        error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
                   " not supported on this host");
        ret = -EINVAL;
        goto fail;
    }
#endif
    if (r->cache_clean_interval > UINT_MAX) {
        error_setg(errp, "Cache clean interval too big");
        ret = -EINVAL;
        goto fail;
    }

    /* lazy-refcounts; flush if going from enabled to disabled */
    r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
    if (r->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
                   "qemu 1.1 compatibility level");
        ret = -EINVAL;
        goto fail;
    }

    if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
            goto fail;
        }
    }

    /* Overlap check options */
    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
    /* Both spellings may be given, but only if they name the same template */
    if (opt_overlap_check_template && opt_overlap_check &&
        strcmp(opt_overlap_check_template, opt_overlap_check))
    {
        error_setg(errp, "Conflicting values for qcow2 options '"
                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
        ret = -EINVAL;
        goto fail;
    }
    /* Fall back to the template option, and to "cached" if neither is set */
    if (!opt_overlap_check) {
        opt_overlap_check = opt_overlap_check_template ?: "cached";
    }

    if (!strcmp(opt_overlap_check, "none")) {
        overlap_check_template = 0;
    } else if (!strcmp(opt_overlap_check, "constant")) {
        overlap_check_template = QCOW2_OL_CONSTANT;
    } else if (!strcmp(opt_overlap_check, "cached")) {
        overlap_check_template = QCOW2_OL_CACHED;
    } else if (!strcmp(opt_overlap_check, "all")) {
        overlap_check_template = QCOW2_OL_ALL;
    } else {
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
                   "'overlap-check'. Allowed are any of the following: "
                   "none, constant, cached, all", opt_overlap_check);
        ret = -EINVAL;
        goto fail;
    }

    r->overlap_check = 0;
    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
        /* overlap-check defines a template bitmask, but every flag may be
         * overwritten through the associated boolean option */
        r->overlap_check |=
            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
                              overlap_check_template & (1 << i)) << i;
    }

    /* NEVER and ALWAYS are fixed; the other discard types are configurable */
    r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
    r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
    r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
                          flags & BDRV_O_UNMAP);
    r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
    r->discard_passthrough[QCOW2_DISCARD_OTHER] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);

    /*
     * An encryption format named in the options must agree with the crypt
     * method recorded in the image header.
     */
    switch (s->crypt_method_header) {
    case QCOW_CRYPT_NONE:
        if (encryptfmt) {
            error_setg(errp, "No encryption in image header, but options "
                       "specified format '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        break;

    case QCOW_CRYPT_AES:
        if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
            error_setg(errp,
                       "Header reported 'aes' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        /* "format" has been checked above; drop it before building the
         * crypto open options from the remaining entries */
        qdict_del(encryptopts, "format");
        r->crypto_opts = block_crypto_open_opts_init(
            Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp);
        break;

    case QCOW_CRYPT_LUKS:
        if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
            error_setg(errp,
                       "Header reported 'luks' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        qdict_del(encryptopts, "format");
        r->crypto_opts = block_crypto_open_opts_init(
            Q_CRYPTO_BLOCK_FORMAT_LUKS, encryptopts, errp);
        break;

    default:
        /* Only errp is set here; the check below supplies the return value
         * (the callers in this file pass in a zero-initialised @r, so
         * r->crypto_opts is still NULL at that point) */
        error_setg(errp, "Unsupported encryption method %d",
                   s->crypt_method_header);
        break;
    }
    if (s->crypt_method_header != QCOW_CRYPT_NONE && !r->crypto_opts) {
        ret = -EINVAL;
        goto fail;
    }

    ret = 0;
    /* The label is also reached on success; it only releases the temporaries */
fail:
    QDECREF(encryptopts);
    qemu_opts_del(opts);
    opts = NULL;
    return ret;
}
1039 
1040 static void qcow2_update_options_commit(BlockDriverState *bs,
1041                                         Qcow2ReopenState *r)
1042 {
1043     BDRVQcow2State *s = bs->opaque;
1044     int i;
1045 
1046     if (s->l2_table_cache) {
1047         qcow2_cache_destroy(bs, s->l2_table_cache);
1048     }
1049     if (s->refcount_block_cache) {
1050         qcow2_cache_destroy(bs, s->refcount_block_cache);
1051     }
1052     s->l2_table_cache = r->l2_table_cache;
1053     s->refcount_block_cache = r->refcount_block_cache;
1054 
1055     s->overlap_check = r->overlap_check;
1056     s->use_lazy_refcounts = r->use_lazy_refcounts;
1057 
1058     for (i = 0; i < QCOW2_DISCARD_MAX; i++) {
1059         s->discard_passthrough[i] = r->discard_passthrough[i];
1060     }
1061 
1062     if (s->cache_clean_interval != r->cache_clean_interval) {
1063         cache_clean_timer_del(bs);
1064         s->cache_clean_interval = r->cache_clean_interval;
1065         cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
1066     }
1067 
1068     qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
1069     s->crypto_opts = r->crypto_opts;
1070 }
1071 
1072 static void qcow2_update_options_abort(BlockDriverState *bs,
1073                                        Qcow2ReopenState *r)
1074 {
1075     if (r->l2_table_cache) {
1076         qcow2_cache_destroy(bs, r->l2_table_cache);
1077     }
1078     if (r->refcount_block_cache) {
1079         qcow2_cache_destroy(bs, r->refcount_block_cache);
1080     }
1081     qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
1082 }
1083 
1084 static int qcow2_update_options(BlockDriverState *bs, QDict *options,
1085                                 int flags, Error **errp)
1086 {
1087     Qcow2ReopenState r = {};
1088     int ret;
1089 
1090     ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);
1091     if (ret >= 0) {
1092         qcow2_update_options_commit(bs, &r);
1093     } else {
1094         qcow2_update_options_abort(bs, &r);
1095     }
1096 
1097     return ret;
1098 }
1099 
/*
 * Open the qcow2 image found on bs->file.
 *
 * The header is read and validated, the derived geometry is stored in the
 * BDRVQcow2State (bs->opaque), the L1 table is loaded, the runtime options
 * in @options are applied, and refcounts, header extensions, encryption,
 * the backing file name, snapshots and autoloading dirty bitmaps are set up.
 * Unless the image is opened read-only, inactive or for checking, a dirty
 * image is repaired at the end.
 *
 * Returns a non-negative value on success. On failure a negative errno
 * value is returned, the error paths here set or propagate an error into
 * @errp, and the resources acquired so far are released at the fail label.
 */
static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
                         Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int len, i;
    int ret = 0;
    QCowHeader header;
    Error *local_err = NULL;
    uint64_t ext_end;
    uint64_t l1_vm_state_index;
    bool update_header = false;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read qcow2 header");
        goto fail;
    }
    /* The header is stored big-endian; convert the version 2 fields in place */
    be32_to_cpus(&header.magic);
    be32_to_cpus(&header.version);
    be64_to_cpus(&header.backing_file_offset);
    be32_to_cpus(&header.backing_file_size);
    be64_to_cpus(&header.size);
    be32_to_cpus(&header.cluster_bits);
    be32_to_cpus(&header.crypt_method);
    be64_to_cpus(&header.l1_table_offset);
    be32_to_cpus(&header.l1_size);
    be64_to_cpus(&header.refcount_table_offset);
    be32_to_cpus(&header.refcount_table_clusters);
    be64_to_cpus(&header.snapshots_offset);
    be32_to_cpus(&header.nb_snapshots);

    if (header.magic != QCOW_MAGIC) {
        error_setg(errp, "Image is not in qcow2 format");
        ret = -EINVAL;
        goto fail;
    }
    if (header.version < 2 || header.version > 3) {
        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }

    s->qcow_version = header.version;

    /* Initialise cluster size */
    if (header.cluster_bits < MIN_CLUSTER_BITS ||
        header.cluster_bits > MAX_CLUSTER_BITS) {
        error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
                   header.cluster_bits);
        ret = -EINVAL;
        goto fail;
    }

    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;
    s->cluster_sectors = 1 << (s->cluster_bits - BDRV_SECTOR_BITS);

    /* Initialise version 3 header fields */
    if (header.version == 2) {
        /* Not read from the image for version 2; fixed defaults are used */
        header.incompatible_features    = 0;
        header.compatible_features      = 0;
        header.autoclear_features       = 0;
        header.refcount_order           = 4;
        header.header_length            = 72;
    } else {
        be64_to_cpus(&header.incompatible_features);
        be64_to_cpus(&header.compatible_features);
        be64_to_cpus(&header.autoclear_features);
        be32_to_cpus(&header.refcount_order);
        be32_to_cpus(&header.header_length);

        if (header.header_length < 104) {
            error_setg(errp, "qcow2 header too short");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (header.header_length > s->cluster_size) {
        error_setg(errp, "qcow2 header exceeds cluster size");
        ret = -EINVAL;
        goto fail;
    }

    /* Keep any header bytes beyond the fields known to QCowHeader */
    if (header.header_length > sizeof(header)) {
        s->unknown_header_fields_size = header.header_length - sizeof(header);
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
                         s->unknown_header_fields_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
                             "fields");
            goto fail;
        }
    }

    if (header.backing_file_offset > s->cluster_size) {
        error_setg(errp, "Invalid backing file offset");
        ret = -EINVAL;
        goto fail;
    }

    /* Header extensions end at the backing file name, or at the end of the
     * first cluster if there is none */
    if (header.backing_file_offset) {
        ext_end = header.backing_file_offset;
    } else {
        ext_end = 1 << header.cluster_bits;
    }

    /* Handle feature bits */
    s->incompatible_features    = header.incompatible_features;
    s->compatible_features      = header.compatible_features;
    s->autoclear_features       = header.autoclear_features;

    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
        /* The extensions are read here only to obtain the feature table
         * that is passed to the error report */
        void *feature_table = NULL;
        qcow2_read_extensions(bs, header.header_length, ext_end,
                              &feature_table, flags, NULL, NULL);
        report_unsupported_feature(errp, feature_table,
                                   s->incompatible_features &
                                   ~QCOW2_INCOMPAT_MASK);
        ret = -ENOTSUP;
        g_free(feature_table);
        goto fail;
    }

    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
        /* Corrupt images may not be written to unless they are being repaired
         */
        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
                       "read/write");
            ret = -EACCES;
            goto fail;
        }
    }

    /* Check support for various header values */
    if (header.refcount_order > 6) {
        error_setg(errp, "Reference count entry width too large; may not "
                   "exceed 64 bits");
        ret = -EINVAL;
        goto fail;
    }
    s->refcount_order = header.refcount_order;
    s->refcount_bits = 1 << s->refcount_order;
    /* refcount_max = 2^refcount_bits - 1, built from two halves so that no
     * shift by the full 64 bit width is needed */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
            error_setg(errp,
                       "Use of AES-CBC encrypted qcow2 images is no longer "
                       "supported in system emulators");
            error_append_hint(errp,
                              "You can use 'qemu-img convert' to convert your "
                              "image to an alternative supported format, such "
                              "as unencrypted qcow2, or raw with the LUKS "
                              "format instead.\n");
            ret = -ENOSYS;
            goto fail;
        }

        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            s->crypt_physical_offset = false;
        } else {
            /* Assuming LUKS and any future crypt methods we
             * add will all use physical offsets, due to the
             * fact that the alternative is insecure...  */
            s->crypt_physical_offset = true;
        }

        bs->encrypted = true;
    }

    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;
    bs->total_sectors = header.size / 512;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;

    s->refcount_table_offset = header.refcount_table_offset;
    /* Number of 8-byte entries in the given number of clusters */
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) {
        error_setg(errp, "Reference count table too large");
        ret = -EINVAL;
        goto fail;
    }

    /* A missing refcount table is only tolerated when opening for a check */
    if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
        error_setg(errp, "Image does not contain a reference count table");
        ret = -EINVAL;
        goto fail;
    }

    ret = validate_table_offset(bs, s->refcount_table_offset,
                                s->refcount_table_size, sizeof(uint64_t));
    if (ret < 0) {
        error_setg(errp, "Invalid reference count table offset");
        goto fail;
    }

    /* Snapshot table offset/length */
    if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) {
        error_setg(errp, "Too many snapshots");
        ret = -EINVAL;
        goto fail;
    }

    ret = validate_table_offset(bs, header.snapshots_offset,
                                header.nb_snapshots,
                                sizeof(QCowSnapshotHeader));
    if (ret < 0) {
        error_setg(errp, "Invalid snapshot table offset");
        goto fail;
    }

    /* read the level 1 table */
    if (header.l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
        error_setg(errp, "Active L1 table too large");
        ret = -EFBIG;
        goto fail;
    }
    s->l1_size = header.l1_size;

    l1_vm_state_index = size_to_l1(s, header.size);
    if (l1_vm_state_index > INT_MAX) {
        error_setg(errp, "Image is too big");
        ret = -EFBIG;
        goto fail;
    }
    s->l1_vm_state_index = l1_vm_state_index;

    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index) {
        error_setg(errp, "L1 table is too small");
        ret = -EINVAL;
        goto fail;
    }

    ret = validate_table_offset(bs, header.l1_table_offset,
                                header.l1_size, sizeof(uint64_t));
    if (ret < 0) {
        error_setg(errp, "Invalid L1 table offset");
        goto fail;
    }
    s->l1_table_offset = header.l1_table_offset;


    if (s->l1_size > 0) {
        /* The buffer size is rounded up to a multiple of 512 bytes */
        s->l1_table = qemu_try_blockalign(bs->file->bs,
            align_offset(s->l1_size * sizeof(uint64_t), 512));
        if (s->l1_table == NULL) {
            error_setg(errp, "Could not allocate L1 table");
            ret = -ENOMEM;
            goto fail;
        }
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read L1 table");
            goto fail;
        }
        /* L1 entries are big-endian on disk as well */
        for(i = 0;i < s->l1_size; i++) {
            be64_to_cpus(&s->l1_table[i]);
        }
    }

    /* Parse driver-specific options */
    ret = qcow2_update_options(bs, options, flags, errp);
    if (ret < 0) {
        goto fail;
    }

    s->cluster_cache_offset = -1;
    s->flags = flags;

    ret = qcow2_refcount_init(bs);
    if (ret != 0) {
        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
        goto fail;
    }

    QLIST_INIT(&s->cluster_allocs);
    QTAILQ_INIT(&s->discards);

    /* read qcow2 extensions */
    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
                              flags, &update_header, &local_err)) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* qcow2_read_extension may have set up the crypto context
     * if the crypt method needs a header region, some methods
     * don't need header extensions, so must check here
     */
    if (s->crypt_method_header && !s->crypto) {
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            unsigned int cflags = 0;
            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
                                           NULL, NULL, cflags, errp);
            if (!s->crypto) {
                ret = -EINVAL;
                goto fail;
            }
        } else if (!(flags & BDRV_O_NO_IO)) {
            /* Any other crypt method without a crypto context is an error,
             * except when the image is opened with BDRV_O_NO_IO */
            error_setg(errp, "Missing CRYPTO header for crypt method %d",
                       s->crypt_method_header);
            ret = -EINVAL;
            goto fail;
        }
    }

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        /* The name must lie within the first cluster and leave room for
         * the terminating NUL in bs->backing_file */
        if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
            len >= sizeof(bs->backing_file)) {
            error_setg(errp, "Backing file name too long");
            ret = -EINVAL;
            goto fail;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                         bs->backing_file, len);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read backing file name");
            goto fail;
        }
        bs->backing_file[len] = '\0';
        s->image_backing_file = g_strdup(bs->backing_file);
    }

    /* Internal snapshots */
    s->snapshots_offset = header.snapshots_offset;
    s->nb_snapshots = header.nb_snapshots;

    ret = qcow2_read_snapshots(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read snapshots");
        goto fail;
    }

    /* Clear unknown autoclear feature bits */
    update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
    /* The header is only rewritten for writable, active images */
    update_header =
        update_header && !bs->read_only && !(flags & BDRV_O_INACTIVE);
    if (update_header) {
        s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
    }

    /* A true return value cancels the pending header update */
    if (qcow2_load_autoloading_dirty_bitmaps(bs, &local_err)) {
        update_header = false;
    }
    if (local_err != NULL) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    if (update_header) {
        ret = qcow2_update_header(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not update qcow2 header");
            goto fail;
        }
    }

    /* Initialise locks */
    qemu_co_mutex_init(&s->lock);
    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;

    /* Repair image if dirty */
    if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
        BdrvCheckResult result = {0};

        ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
        if (ret < 0 || result.check_errors) {
            /* The check itself ran, but reported errors: map that to -EIO */
            if (ret >= 0) {
                ret = -EIO;
            }
            error_setg_errno(errp, -ret, "Could not repair dirty image");
            goto fail;
        }
    }

#ifdef DEBUG_ALLOC
    {
        BdrvCheckResult result = {0};
        qcow2_check_refcounts(bs, &result, 0);
    }
#endif
    return ret;

    /* Common error exit: release everything that may have been set up */
 fail:
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);
    qcow2_free_snapshots(bs);
    qcow2_refcount_close(bs);
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;
    cache_clean_timer_del(bs);
    if (s->l2_table_cache) {
        qcow2_cache_destroy(bs, s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(bs, s->refcount_block_cache);
    }
    qcrypto_block_free(s->crypto);
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
    return ret;
}
1526 
1527 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
1528                       Error **errp)
1529 {
1530     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
1531                                false, errp);
1532     if (!bs->file) {
1533         return -EINVAL;
1534     }
1535 
1536     return qcow2_do_open(bs, options, flags, errp);
1537 }
1538 
1539 static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
1540 {
1541     BDRVQcow2State *s = bs->opaque;
1542 
1543     if (bs->encrypted) {
1544         /* Encryption works on a sector granularity */
1545         bs->bl.request_alignment = BDRV_SECTOR_SIZE;
1546     }
1547     bs->bl.pwrite_zeroes_alignment = s->cluster_size;
1548     bs->bl.pdiscard_alignment = s->cluster_size;
1549 }
1550 
1551 static int qcow2_reopen_prepare(BDRVReopenState *state,
1552                                 BlockReopenQueue *queue, Error **errp)
1553 {
1554     Qcow2ReopenState *r;
1555     int ret;
1556 
1557     r = g_new0(Qcow2ReopenState, 1);
1558     state->opaque = r;
1559 
1560     ret = qcow2_update_options_prepare(state->bs, r, state->options,
1561                                        state->flags, errp);
1562     if (ret < 0) {
1563         goto fail;
1564     }
1565 
1566     /* We need to write out any unwritten data if we reopen read-only. */
1567     if ((state->flags & BDRV_O_RDWR) == 0) {
1568         ret = qcow2_reopen_bitmaps_ro(state->bs, errp);
1569         if (ret < 0) {
1570             goto fail;
1571         }
1572 
1573         ret = bdrv_flush(state->bs);
1574         if (ret < 0) {
1575             goto fail;
1576         }
1577 
1578         ret = qcow2_mark_clean(state->bs);
1579         if (ret < 0) {
1580             goto fail;
1581         }
1582     }
1583 
1584     return 0;
1585 
1586 fail:
1587     qcow2_update_options_abort(state->bs, r);
1588     g_free(r);
1589     return ret;
1590 }
1591 
1592 static void qcow2_reopen_commit(BDRVReopenState *state)
1593 {
1594     qcow2_update_options_commit(state->bs, state->opaque);
1595     g_free(state->opaque);
1596 }
1597 
1598 static void qcow2_reopen_abort(BDRVReopenState *state)
1599 {
1600     qcow2_update_options_abort(state->bs, state->opaque);
1601     g_free(state->opaque);
1602 }
1603 
1604 static void qcow2_join_options(QDict *options, QDict *old_options)
1605 {
1606     bool has_new_overlap_template =
1607         qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
1608         qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
1609     bool has_new_total_cache_size =
1610         qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
1611     bool has_all_cache_options;
1612 
1613     /* New overlap template overrides all old overlap options */
1614     if (has_new_overlap_template) {
1615         qdict_del(old_options, QCOW2_OPT_OVERLAP);
1616         qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
1617         qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
1618         qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
1619         qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
1620         qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
1621         qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
1622         qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
1623         qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
1624         qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
1625     }
1626 
1627     /* New total cache size overrides all old options */
1628     if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) {
1629         qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
1630         qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
1631     }
1632 
1633     qdict_join(options, old_options, false);
1634 
1635     /*
1636      * If after merging all cache size options are set, an old total size is
1637      * overwritten. Do keep all options, however, if all three are new. The
1638      * resulting error message is what we want to happen.
1639      */
1640     has_all_cache_options =
1641         qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) ||
1642         qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) ||
1643         qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
1644 
1645     if (has_all_cache_options && !has_new_total_cache_size) {
1646         qdict_del(options, QCOW2_OPT_CACHE_SIZE);
1647     }
1648 }
1649 
1650 static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
1651         int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
1652 {
1653     BDRVQcow2State *s = bs->opaque;
1654     uint64_t cluster_offset;
1655     int index_in_cluster, ret;
1656     unsigned int bytes;
1657     int64_t status = 0;
1658 
1659     bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE);
1660     qemu_co_mutex_lock(&s->lock);
1661     ret = qcow2_get_cluster_offset(bs, sector_num << BDRV_SECTOR_BITS, &bytes,
1662                                    &cluster_offset);
1663     qemu_co_mutex_unlock(&s->lock);
1664     if (ret < 0) {
1665         return ret;
1666     }
1667 
1668     *pnum = bytes >> BDRV_SECTOR_BITS;
1669 
1670     if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
1671         !s->crypto) {
1672         index_in_cluster = sector_num & (s->cluster_sectors - 1);
1673         cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
1674         *file = bs->file->bs;
1675         status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
1676     }
1677     if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) {
1678         status |= BDRV_BLOCK_ZERO;
1679     } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
1680         status |= BDRV_BLOCK_DATA;
1681     }
1682     return status;
1683 }
1684 
/*
 * Read @bytes bytes starting at guest offset @offset into @qiov.
 *
 * The request is served in chunks. For every chunk,
 * qcow2_get_cluster_offset() reports the cluster type and host offset (the
 * chunk length is passed to it by pointer), and the chunk is then filled
 * according to that type:
 *  - unallocated: read from bs->backing if there is one, zeroes otherwise
 *  - zero clusters: zeroes
 *  - compressed: decompressed via qcow2_decompress_cluster() and copied out
 *    of s->cluster_cache
 *  - normal: read from bs->file; for encrypted images through a bounce
 *    buffer that is decrypted before being copied into @qiov
 *
 * s->lock is held throughout, except around the bdrv_co_preadv() calls.
 * @flags is not used.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                        uint64_t bytes, QEMUIOVector *qiov,
                                        int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset = 0;
    uint64_t bytes_done = 0;
    QEMUIOVector hd_qiov;
    uint8_t *cluster_data = NULL; /* bounce buffer, allocated on first use */

    qemu_iovec_init(&hd_qiov, qiov->niov);

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        /* prepare next request */
        cur_bytes = MIN(bytes, INT_MAX);
        if (s->crypto) {
            /* Never exceed the size of the decryption bounce buffer */
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        }

        ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset);
        if (ret < 0) {
            goto fail;
        }

        offset_in_cluster = offset_into_cluster(s, offset);

        /* hd_qiov becomes the slice of @qiov that this chunk fills */
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        switch (ret) {
        case QCOW2_CLUSTER_UNALLOCATED:

            if (bs->backing) {
                BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
                /* Drop the lock while the backing file read is in flight */
                qemu_co_mutex_unlock(&s->lock);
                ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
                                     &hd_qiov, 0);
                qemu_co_mutex_lock(&s->lock);
                if (ret < 0) {
                    goto fail;
                }
            } else {
                /* Note: in this case, no need to wait */
                qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
            }
            break;

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
            break;

        case QCOW2_CLUSTER_COMPRESSED:
            /* add AIO support for compressed blocks ? */
            ret = qcow2_decompress_cluster(bs, cluster_offset);
            if (ret < 0) {
                goto fail;
            }

            /* Copy the requested part out of the decompressed cluster */
            qemu_iovec_from_buf(&hd_qiov, 0,
                                s->cluster_cache + offset_in_cluster,
                                cur_bytes);
            break;

        case QCOW2_CLUSTER_NORMAL:
            /* A host offset that is not 512-byte aligned is an I/O error */
            if ((cluster_offset & 511) != 0) {
                ret = -EIO;
                goto fail;
            }

            if (bs->encrypted) {
                assert(s->crypto);

                /*
                 * For encrypted images, read everything into a temporary
                 * contiguous buffer on which the AES functions can work.
                 */
                if (!cluster_data) {
                    cluster_data =
                        qemu_try_blockalign(bs->file->bs,
                                            QCOW_MAX_CRYPT_CLUSTERS
                                            * s->cluster_size);
                    if (cluster_data == NULL) {
                        ret = -ENOMEM;
                        goto fail;
                    }
                }

                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
                /* Redirect the read into the bounce buffer */
                qemu_iovec_reset(&hd_qiov);
                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
            }

            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
            /* Drop the lock while the image file read is in flight */
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_preadv(bs->file,
                                 cluster_offset + offset_in_cluster,
                                 cur_bytes, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
            if (bs->encrypted) {
                assert(s->crypto);
                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
                /* The offset handed to the cipher is either the host or the
                 * guest offset, depending on s->crypt_physical_offset */
                if (qcrypto_block_decrypt(s->crypto,
                                          (s->crypt_physical_offset ?
                                           cluster_offset + offset_in_cluster :
                                           offset),
                                          cluster_data,
                                          cur_bytes,
                                          NULL) < 0) {
                    ret = -EIO;
                    goto fail;
                }
                /* Hand the decrypted data to the caller's vector */
                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
            }
            break;

        default:
            g_assert_not_reached();
            ret = -EIO;
            goto fail;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        bytes_done += cur_bytes;
    }
    ret = 0;

fail:
    /* Reached on success as well; ret holds the final result */
    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cluster_data);

    return ret;
}
1832 
1833 /* Check if it's possible to merge a write request with the writing of
1834  * the data from the COW regions */
1835 static bool merge_cow(uint64_t offset, unsigned bytes,
1836                       QEMUIOVector *hd_qiov, QCowL2Meta *l2meta)
1837 {
1838     QCowL2Meta *m;
1839 
1840     for (m = l2meta; m != NULL; m = m->next) {
1841         /* If both COW regions are empty then there's nothing to merge */
1842         if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
1843             continue;
1844         }
1845 
1846         /* The data (middle) region must be immediately after the
1847          * start region */
1848         if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
1849             continue;
1850         }
1851 
1852         /* The end region must be immediately after the data (middle)
1853          * region */
1854         if (m->offset + m->cow_end.offset != offset + bytes) {
1855             continue;
1856         }
1857 
1858         /* Make sure that adding both COW regions to the QEMUIOVector
1859          * does not exceed IOV_MAX */
1860         if (hd_qiov->niov > IOV_MAX - 2) {
1861             continue;
1862         }
1863 
1864         m->data_qiov = hd_qiov;
1865         return true;
1866     }
1867 
1868     return false;
1869 }
1870 
/*
 * Write @bytes bytes from @qiov to guest offset @offset.
 *
 * The request is processed in chunks. For every chunk, host clusters are
 * obtained from qcow2_alloc_cluster_offset(), the data is encrypted into a
 * bounce buffer if the image is encrypted, the target range is checked with
 * qcow2_pre_write_overlap_check(), and the data is written to bs->file
 * unless merge_cow() attached it to a pending copy-on-write request.
 * Afterwards every QCowL2Meta returned for the chunk is linked into the L2
 * tables, removed from the in-flight list and freed.
 *
 * s->lock is held throughout, except around the bdrv_co_pwritev() call.
 * @flags is not used.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset;
    QEMUIOVector hd_qiov;
    uint64_t bytes_done = 0;
    uint8_t *cluster_data = NULL; /* bounce buffer, allocated on first use */
    QCowL2Meta *l2meta = NULL;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);

    qemu_iovec_init(&hd_qiov, qiov->niov);

    s->cluster_cache_offset = -1; /* disable compressed cache */

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        /* Reset for this chunk; the fail path frees whatever is left in it */
        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
        offset_in_cluster = offset_into_cluster(s, offset);
        cur_bytes = MIN(bytes, INT_MAX);
        if (bs->encrypted) {
            /* Keep the chunk within the size of the encryption buffer */
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                            - offset_in_cluster);
        }

        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        assert((cluster_offset & 511) == 0);

        /* hd_qiov becomes the slice of @qiov that this chunk writes */
        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        if (bs->encrypted) {
            assert(s->crypto);
            if (!cluster_data) {
                cluster_data = qemu_try_blockalign(bs->file->bs,
                                                   QCOW_MAX_CRYPT_CLUSTERS
                                                   * s->cluster_size);
                if (cluster_data == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }

            assert(hd_qiov.size <=
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
            /* Gather the plaintext into the contiguous bounce buffer */
            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);

            /* The offset handed to the cipher is either the host or the
             * guest offset, depending on s->crypt_physical_offset */
            if (qcrypto_block_encrypt(s->crypto,
                                      (s->crypt_physical_offset ?
                                       cluster_offset + offset_in_cluster :
                                       offset),
                                      cluster_data,
                                      cur_bytes, NULL) < 0) {
                ret = -EIO;
                goto fail;
            }

            /* From here on the ciphertext is what gets written */
            qemu_iovec_reset(&hd_qiov);
            qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
        }

        ret = qcow2_pre_write_overlap_check(bs, 0,
                cluster_offset + offset_in_cluster, cur_bytes);
        if (ret < 0) {
            goto fail;
        }

        /* If we need to do COW, check if it's possible to merge the
         * writing of the guest data together with that of the COW regions.
         * If it's not possible (or not necessary) then write the
         * guest data now. */
        if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) {
            /* Drop the lock while the image file write is in flight */
            qemu_co_mutex_unlock(&s->lock);
            BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
            trace_qcow2_writev_data(qemu_coroutine_self(),
                                    cluster_offset + offset_in_cluster);
            ret = bdrv_co_pwritev(bs->file,
                                  cluster_offset + offset_in_cluster,
                                  cur_bytes, &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
            if (ret < 0) {
                goto fail;
            }
        }

        while (l2meta != NULL) {
            QCowL2Meta *next;

            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
            if (ret < 0) {
                goto fail;
            }

            /* Take the request off the list of running requests */
            if (l2meta->nb_clusters != 0) {
                QLIST_REMOVE(l2meta, next_in_flight);
            }

            qemu_co_queue_restart_all(&l2meta->dependent_requests);

            next = l2meta->next;
            g_free(l2meta);
            l2meta = next;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        bytes_done += cur_bytes;
        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
    }
    ret = 0;

fail:
    /* Reached on success as well (with l2meta == NULL). On failure, release
     * the requests of the current chunk that were not linked and freed. */
    while (l2meta != NULL) {
        QCowL2Meta *next;

        if (l2meta->nb_clusters != 0) {
            QLIST_REMOVE(l2meta, next_in_flight);
        }
        qemu_co_queue_restart_all(&l2meta->dependent_requests);

        next = l2meta->next;
        g_free(l2meta);
        l2meta = next;
    }

    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cluster_data);
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);

    return ret;
}
2020 
2021 static int qcow2_inactivate(BlockDriverState *bs)
2022 {
2023     BDRVQcow2State *s = bs->opaque;
2024     int ret, result = 0;
2025     Error *local_err = NULL;
2026 
2027     qcow2_store_persistent_dirty_bitmaps(bs, &local_err);
2028     if (local_err != NULL) {
2029         result = -EINVAL;
2030         error_report_err(local_err);
2031         error_report("Persistent bitmaps are lost for node '%s'",
2032                      bdrv_get_device_or_node_name(bs));
2033     }
2034 
2035     ret = qcow2_cache_flush(bs, s->l2_table_cache);
2036     if (ret) {
2037         result = ret;
2038         error_report("Failed to flush the L2 table cache: %s",
2039                      strerror(-ret));
2040     }
2041 
2042     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2043     if (ret) {
2044         result = ret;
2045         error_report("Failed to flush the refcount block cache: %s",
2046                      strerror(-ret));
2047     }
2048 
2049     if (result == 0) {
2050         qcow2_mark_clean(bs);
2051     }
2052 
2053     return result;
2054 }
2055 
/*
 * Release the driver state of @bs. Unless the image is already inactive
 * (BDRV_O_INACTIVE set in s->flags), qcow2_inactivate() is run first; its
 * return value is ignored here. The order of the steps below matters (see
 * the note on the L1 table).
 */
static void qcow2_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;

    if (!(s->flags & BDRV_O_INACTIVE)) {
        qcow2_inactivate(bs);
    }

    /* Metadata caches and their cleanup timer */
    cache_clean_timer_del(bs);
    qcow2_cache_destroy(bs, s->l2_table_cache);
    qcow2_cache_destroy(bs, s->refcount_block_cache);

    /* Encryption context; cleared so that no stale pointer is left behind */
    qcrypto_block_free(s->crypto);
    s->crypto = NULL;

    /* Header data kept around for rewriting the header */
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);

    g_free(s->image_backing_file);
    g_free(s->image_backing_format);

    /* Cluster buffers, refcount state and snapshot list */
    g_free(s->cluster_cache);
    qemu_vfree(s->cluster_data);
    qcow2_refcount_close(bs);
    qcow2_free_snapshots(bs);
}
2085 
2086 static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)
2087 {
2088     BDRVQcow2State *s = bs->opaque;
2089     int flags = s->flags;
2090     QCryptoBlock *crypto = NULL;
2091     QDict *options;
2092     Error *local_err = NULL;
2093     int ret;
2094 
2095     /*
2096      * Backing files are read-only which makes all of their metadata immutable,
2097      * that means we don't have to worry about reopening them here.
2098      */
2099 
2100     crypto = s->crypto;
2101     s->crypto = NULL;
2102 
2103     qcow2_close(bs);
2104 
2105     memset(s, 0, sizeof(BDRVQcow2State));
2106     options = qdict_clone_shallow(bs->options);
2107 
2108     flags &= ~BDRV_O_INACTIVE;
2109     ret = qcow2_do_open(bs, options, flags, &local_err);
2110     QDECREF(options);
2111     if (local_err) {
2112         error_propagate(errp, local_err);
2113         error_prepend(errp, "Could not reopen qcow2 layer: ");
2114         bs->drv = NULL;
2115         return;
2116     } else if (ret < 0) {
2117         error_setg_errno(errp, -ret, "Could not reopen qcow2 layer");
2118         bs->drv = NULL;
2119         return;
2120     }
2121 
2122     s->crypto = crypto;
2123 }
2124 
2125 static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
2126     size_t len, size_t buflen)
2127 {
2128     QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
2129     size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
2130 
2131     if (buflen < ext_len) {
2132         return -ENOSPC;
2133     }
2134 
2135     *ext_backing_fmt = (QCowExtension) {
2136         .magic  = cpu_to_be32(magic),
2137         .len    = cpu_to_be32(len),
2138     };
2139 
2140     if (len) {
2141         memcpy(buf + sizeof(QCowExtension), s, len);
2142     }
2143 
2144     return ext_len;
2145 }
2146 
/*
 * Updates the qcow2 header, including the variable length parts of it, i.e.
 * the backing file name and all extensions. qcow2 was not designed to allow
 * such changes, so if we run out of space (we can only use the first cluster)
 * this function may fail.
 *
 * The header is assembled in a cluster-sized scratch buffer: 'buf' and
 * 'buflen' act as a write cursor and remaining-space counter that advance
 * past each part, while 'header' keeps pointing at the start of the buffer.
 *
 * Returns 0 on success, -errno in error cases.
 */
int qcow2_update_header(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    QCowHeader *header;
    char *buf;
    size_t buflen = s->cluster_size;
    int ret;
    uint64_t total_size;
    uint32_t refcount_table_clusters;
    size_t header_length;
    Qcow2UnknownHeaderExtension *uext;

    buf = qemu_blockalign(bs, buflen);

    /* Header structure */
    header = (QCowHeader*) buf;

    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
    }

    header_length = sizeof(*header) + s->unknown_header_fields_size;
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    /* All multi-byte fields are stored big-endian */
    *header = (QCowHeader) {
        /* Version 2 fields */
        .magic                  = cpu_to_be32(QCOW_MAGIC),
        .version                = cpu_to_be32(s->qcow_version),
        /* The backing file fields are filled in at the end, if needed */
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
        .refcount_order         = cpu_to_be32(s->refcount_order),
        .header_length          = cpu_to_be32(header_length),
    };

    /* For older versions, write a shorter header */
    /* (ret temporarily holds the number of fixed header bytes to keep) */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
        ret = -EINVAL;
        goto fail;
    }

    /* Skip the fixed header and clear everything behind it */
    buf += ret;
    buflen -= ret;
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }

    /* Backing file format header extension */
    if (s->image_backing_format) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
                             s->image_backing_format,
                             strlen(s->image_backing_format),
                             buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* Full disk encryption header pointer extension */
    if (s->crypto_header.offset != 0) {
        /* Byte-swap the in-memory copy in place for serialisation and swap
         * it back right afterwards, also when adding the extension failed */
        cpu_to_be64s(&s->crypto_header.offset);
        cpu_to_be64s(&s->crypto_header.length);
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
                             &s->crypto_header, sizeof(s->crypto_header),
                             buflen);
        be64_to_cpus(&s->crypto_header.offset);
        be64_to_cpus(&s->crypto_header.length);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Feature table */
    if (s->qcow_version >= 3) {
        Qcow2Feature features[] = {
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
                .name = "dirty bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
                .name = "corrupt bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_COMPATIBLE,
                .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
                .name = "lazy refcounts",
            },
        };

        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                             features, sizeof(features), buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Bitmap extension */
    if (s->nb_bitmaps > 0) {
        Qcow2BitmapHeaderExt bitmaps_header = {
            .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
            .bitmap_directory_size =
                    cpu_to_be64(s->bitmap_directory_size),
            .bitmap_directory_offset =
                    cpu_to_be64(s->bitmap_directory_offset)
        };
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
                             &bitmaps_header, sizeof(bitmaps_header),
                             buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
    if (ret < 0) {
        goto fail;
    }

    buf += ret;
    buflen -= ret;

    /* Backing file name */
    if (s->image_backing_file) {
        size_t backing_file_len = strlen(s->image_backing_file);

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

        /* Using strncpy is ok here, since buf is not NUL-terminated. */
        strncpy(buf, s->image_backing_file, buflen);

        /* The name's offset is relative to the start of the header */
        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
    }

    /* Write the new header */
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    /* Reached on success as well. 'header' is the start of the allocation;
     * 'buf' may have been advanced and must not be freed. */
    qemu_vfree(header);
    return ret;
}
2359 
2360 static int qcow2_change_backing_file(BlockDriverState *bs,
2361     const char *backing_file, const char *backing_fmt)
2362 {
2363     BDRVQcow2State *s = bs->opaque;
2364 
2365     if (backing_file && strlen(backing_file) > 1023) {
2366         return -EINVAL;
2367     }
2368 
2369     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
2370     pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
2371 
2372     g_free(s->image_backing_file);
2373     g_free(s->image_backing_format);
2374 
2375     s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL;
2376     s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL;
2377 
2378     return qcow2_update_header(bs);
2379 }
2380 
2381 static int qcow2_crypt_method_from_format(const char *encryptfmt)
2382 {
2383     if (g_str_equal(encryptfmt, "luks")) {
2384         return QCOW_CRYPT_LUKS;
2385     } else if (g_str_equal(encryptfmt, "aes")) {
2386         return QCOW_CRYPT_AES;
2387     } else {
2388         return -EINVAL;
2389     }
2390 }
2391 
2392 static int qcow2_set_up_encryption(BlockDriverState *bs, const char *encryptfmt,
2393                                    QemuOpts *opts, Error **errp)
2394 {
2395     BDRVQcow2State *s = bs->opaque;
2396     QCryptoBlockCreateOptions *cryptoopts = NULL;
2397     QCryptoBlock *crypto = NULL;
2398     int ret = -EINVAL;
2399     QDict *options, *encryptopts;
2400     int fmt;
2401 
2402     options = qemu_opts_to_qdict(opts, NULL);
2403     qdict_extract_subqdict(options, &encryptopts, "encrypt.");
2404     QDECREF(options);
2405 
2406     fmt = qcow2_crypt_method_from_format(encryptfmt);
2407 
2408     switch (fmt) {
2409     case QCOW_CRYPT_LUKS:
2410         cryptoopts = block_crypto_create_opts_init(
2411             Q_CRYPTO_BLOCK_FORMAT_LUKS, encryptopts, errp);
2412         break;
2413     case QCOW_CRYPT_AES:
2414         cryptoopts = block_crypto_create_opts_init(
2415             Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp);
2416         break;
2417     default:
2418         error_setg(errp, "Unknown encryption format '%s'", encryptfmt);
2419         break;
2420     }
2421     if (!cryptoopts) {
2422         ret = -EINVAL;
2423         goto out;
2424     }
2425     s->crypt_method_header = fmt;
2426 
2427     crypto = qcrypto_block_create(cryptoopts, "encrypt.",
2428                                   qcow2_crypto_hdr_init_func,
2429                                   qcow2_crypto_hdr_write_func,
2430                                   bs, errp);
2431     if (!crypto) {
2432         ret = -EINVAL;
2433         goto out;
2434     }
2435 
2436     ret = qcow2_update_header(bs);
2437     if (ret < 0) {
2438         error_setg_errno(errp, -ret, "Could not write encryption header");
2439         goto out;
2440     }
2441 
2442  out:
2443     QDECREF(encryptopts);
2444     qcrypto_block_free(crypto);
2445     qapi_free_QCryptoBlockCreateOptions(cryptoopts);
2446     return ret;
2447 }
2448 
2449 
/* Arguments and result slot passed to preallocate_co() */
typedef struct PreallocCo {
    BlockDriverState *bs;   /* image to preallocate metadata for */
    uint64_t offset;        /* guest offset at which preallocation starts */
    uint64_t new_length;    /* guest offset at which preallocation ends */

    int ret;                /* set by preallocate_co() when it finishes;
                             * preallocate() initialises it to -EINPROGRESS */
} PreallocCo;
2457 
2458 /**
2459  * Preallocates metadata structures for data clusters between @offset (in the
2460  * guest disk) and @new_length (which is thus generally the new guest disk
2461  * size).
2462  *
2463  * Returns: 0 on success, -errno on failure.
2464  */
2465 static void coroutine_fn preallocate_co(void *opaque)
2466 {
2467     PreallocCo *params = opaque;
2468     BlockDriverState *bs = params->bs;
2469     uint64_t offset = params->offset;
2470     uint64_t new_length = params->new_length;
2471     BDRVQcow2State *s = bs->opaque;
2472     uint64_t bytes;
2473     uint64_t host_offset = 0;
2474     unsigned int cur_bytes;
2475     int ret;
2476     QCowL2Meta *meta;
2477 
2478     qemu_co_mutex_lock(&s->lock);
2479 
2480     assert(offset <= new_length);
2481     bytes = new_length - offset;
2482 
2483     while (bytes) {
2484         cur_bytes = MIN(bytes, INT_MAX);
2485         ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
2486                                          &host_offset, &meta);
2487         if (ret < 0) {
2488             goto done;
2489         }
2490 
2491         while (meta) {
2492             QCowL2Meta *next = meta->next;
2493 
2494             ret = qcow2_alloc_cluster_link_l2(bs, meta);
2495             if (ret < 0) {
2496                 qcow2_free_any_clusters(bs, meta->alloc_offset,
2497                                         meta->nb_clusters, QCOW2_DISCARD_NEVER);
2498                 goto done;
2499             }
2500 
2501             /* There are no dependent requests, but we need to remove our
2502              * request from the list of in-flight requests */
2503             QLIST_REMOVE(meta, next_in_flight);
2504 
2505             g_free(meta);
2506             meta = next;
2507         }
2508 
2509         /* TODO Preallocate data if requested */
2510 
2511         bytes -= cur_bytes;
2512         offset += cur_bytes;
2513     }
2514 
2515     /*
2516      * It is expected that the image file is large enough to actually contain
2517      * all of the allocated clusters (otherwise we get failing reads after
2518      * EOF). Extend the image to the last allocated sector.
2519      */
2520     if (host_offset != 0) {
2521         uint8_t data = 0;
2522         ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1,
2523                           &data, 1);
2524         if (ret < 0) {
2525             goto done;
2526         }
2527     }
2528 
2529     ret = 0;
2530 
2531 done:
2532     qemu_co_mutex_unlock(&s->lock);
2533     params->ret = ret;
2534 }
2535 
2536 static int preallocate(BlockDriverState *bs,
2537                        uint64_t offset, uint64_t new_length)
2538 {
2539     PreallocCo params = {
2540         .bs         = bs,
2541         .offset     = offset,
2542         .new_length = new_length,
2543         .ret        = -EINPROGRESS,
2544     };
2545 
2546     if (qemu_in_coroutine()) {
2547         preallocate_co(&params);
2548     } else {
2549         Coroutine *co = qemu_coroutine_create(preallocate_co, &params);
2550         bdrv_coroutine_enter(bs, co);
2551         BDRV_POLL_WHILE(bs, params.ret == -EINPROGRESS);
2552     }
2553     return params.ret;
2554 }
2555 
/* qcow2_refcount_metadata_size:
 * @clusters: number of clusters to refcount (including data and L1/L2 tables)
 * @cluster_size: size of a cluster, in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 * @generous_increase: allow for the refcount table to be 1.5x as large as it
 *                     needs to be
 * @refblock_count: if not NULL, receives the number of refcount block
 *                  clusters
 *
 * Returns: Number of bytes required for refcount blocks and table metadata.
 */
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
                                     int refcount_order, bool generous_increase,
                                     uint64_t *refblock_count)
{
    /*
     * All host clusters carry a refcount, and that includes the clusters
     * holding the refcount metadata itself. Rather than deriving a closed
     * formula, iterate until the number of refcount blocks and table
     * clusters no longer changes (a fixed point).
     */
    int64_t blocks_per_table_cluster = cluster_size / sizeof(uint64_t);
    int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
    int64_t table_clusters = 0;     /* refcount table clusters */
    int64_t block_clusters = 0;     /* refcount block clusters */
    int64_t total = 0;
    int64_t prev_total;

    for (;;) {
        prev_total = total;
        block_clusters = DIV_ROUND_UP(clusters + table_clusters +
                                      block_clusters, refcounts_per_block);
        table_clusters = DIV_ROUND_UP(block_clusters,
                                      blocks_per_table_cluster);
        total = clusters + block_clusters + table_clusters;

        if (total == prev_total && generous_increase) {
            /* Converged once: add the extra clusters and settle again */
            clusters += DIV_ROUND_UP(table_clusters, 2);
            total = 0; /* force another loop */
            generous_increase = false;
        }

        if (total == prev_total) {
            break;
        }
    }

    if (refblock_count != NULL) {
        *refblock_count = block_clusters;
    }

    return (block_clusters + table_clusters) * cluster_size;
}
2604 
/**
 * qcow2_calc_prealloc_size:
 * @total_size: virtual disk size in bytes
 * @cluster_size: cluster size in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 *
 * Adds up the cluster-aligned data size, one header cluster, the L2 and L1
 * tables and the refcount metadata needed to cover all of the above.
 *
 * Returns: Total number of bytes required for the fully allocated image
 * (including metadata).
 */
static int64_t qcow2_calc_prealloc_size(int64_t total_size,
                                        size_t cluster_size,
                                        int refcount_order)
{
    const size_t entries_per_cluster = cluster_size / sizeof(uint64_t);
    const int64_t data_size = align_offset(total_size, cluster_size);
    uint64_t l2_entries, l1_entries;
    int64_t overhead;

    /* The header occupies one cluster */
    overhead = cluster_size;

    /* L2 tables: one entry per data cluster, rounded up to whole tables */
    l2_entries = align_offset(data_size / cluster_size, entries_per_cluster);
    overhead += l2_entries * sizeof(uint64_t);

    /* L1 table: one entry per L2 table, rounded up to whole clusters */
    l1_entries = align_offset(l2_entries * sizeof(uint64_t) / cluster_size,
                              entries_per_cluster);
    overhead += l1_entries * sizeof(uint64_t);

    /* Refcount table and blocks covering data plus the metadata so far */
    overhead += qcow2_refcount_metadata_size(
            (overhead + data_size) / cluster_size,
            cluster_size, refcount_order, false, NULL);

    return overhead + data_size;
}
2642 
2643 static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, Error **errp)
2644 {
2645     size_t cluster_size;
2646     int cluster_bits;
2647 
2648     cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
2649                                          DEFAULT_CLUSTER_SIZE);
2650     cluster_bits = ctz32(cluster_size);
2651     if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
2652         (1 << cluster_bits) != cluster_size)
2653     {
2654         error_setg(errp, "Cluster size must be a power of two between %d and "
2655                    "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
2656         return 0;
2657     }
2658     return cluster_size;
2659 }
2660 
2661 static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
2662 {
2663     char *buf;
2664     int ret;
2665 
2666     buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
2667     if (!buf) {
2668         ret = 3; /* default */
2669     } else if (!strcmp(buf, "0.10")) {
2670         ret = 2;
2671     } else if (!strcmp(buf, "1.1")) {
2672         ret = 3;
2673     } else {
2674         error_setg(errp, "Invalid compatibility level: '%s'", buf);
2675         ret = -EINVAL;
2676     }
2677     g_free(buf);
2678     return ret;
2679 }
2680 
2681 static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
2682                                                 Error **errp)
2683 {
2684     uint64_t refcount_bits;
2685 
2686     refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);
2687     if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
2688         error_setg(errp, "Refcount width must be a power of two and may not "
2689                    "exceed 64 bits");
2690         return 0;
2691     }
2692 
2693     if (version < 3 && refcount_bits != 16) {
2694         error_setg(errp, "Different refcount widths than 16 bits require "
2695                    "compatibility level 1.1 or above (use compat=1.1 or "
2696                    "greater)");
2697         return 0;
2698     }
2699 
2700     return refcount_bits;
2701 }
2702 
2703 static int qcow2_create2(const char *filename, int64_t total_size,
2704                          const char *backing_file, const char *backing_format,
2705                          int flags, size_t cluster_size, PreallocMode prealloc,
2706                          QemuOpts *opts, int version, int refcount_order,
2707                          const char *encryptfmt, Error **errp)
2708 {
2709     QDict *options;
2710 
2711     /*
2712      * Open the image file and write a minimal qcow2 header.
2713      *
2714      * We keep things simple and start with a zero-sized image. We also
2715      * do without refcount blocks or a L1 table for now. We'll fix the
2716      * inconsistency later.
2717      *
2718      * We do need a refcount table because growing the refcount table means
2719      * allocating two new refcount blocks - the seconds of which would be at
2720      * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
2721      * size for any qcow2 image.
2722      */
2723     BlockBackend *blk;
2724     QCowHeader *header;
2725     uint64_t* refcount_table;
2726     Error *local_err = NULL;
2727     int ret;
2728 
2729     if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
2730         int64_t prealloc_size =
2731             qcow2_calc_prealloc_size(total_size, cluster_size, refcount_order);
2732         qemu_opt_set_number(opts, BLOCK_OPT_SIZE, prealloc_size, &error_abort);
2733         qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_str(prealloc),
2734                      &error_abort);
2735     }
2736 
2737     ret = bdrv_create_file(filename, opts, &local_err);
2738     if (ret < 0) {
2739         error_propagate(errp, local_err);
2740         return ret;
2741     }
2742 
2743     blk = blk_new_open(filename, NULL, NULL,
2744                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
2745                        &local_err);
2746     if (blk == NULL) {
2747         error_propagate(errp, local_err);
2748         return -EIO;
2749     }
2750 
2751     blk_set_allow_write_beyond_eof(blk, true);
2752 
2753     /* Write the header */
2754     QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
2755     header = g_malloc0(cluster_size);
2756     *header = (QCowHeader) {
2757         .magic                      = cpu_to_be32(QCOW_MAGIC),
2758         .version                    = cpu_to_be32(version),
2759         .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
2760         .size                       = cpu_to_be64(0),
2761         .l1_table_offset            = cpu_to_be64(0),
2762         .l1_size                    = cpu_to_be32(0),
2763         .refcount_table_offset      = cpu_to_be64(cluster_size),
2764         .refcount_table_clusters    = cpu_to_be32(1),
2765         .refcount_order             = cpu_to_be32(refcount_order),
2766         .header_length              = cpu_to_be32(sizeof(*header)),
2767     };
2768 
2769     /* We'll update this to correct value later */
2770     header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
2771 
2772     if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
2773         header->compatible_features |=
2774             cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
2775     }
2776 
2777     ret = blk_pwrite(blk, 0, header, cluster_size, 0);
2778     g_free(header);
2779     if (ret < 0) {
2780         error_setg_errno(errp, -ret, "Could not write qcow2 header");
2781         goto out;
2782     }
2783 
2784     /* Write a refcount table with one refcount block */
2785     refcount_table = g_malloc0(2 * cluster_size);
2786     refcount_table[0] = cpu_to_be64(2 * cluster_size);
2787     ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0);
2788     g_free(refcount_table);
2789 
2790     if (ret < 0) {
2791         error_setg_errno(errp, -ret, "Could not write refcount table");
2792         goto out;
2793     }
2794 
2795     blk_unref(blk);
2796     blk = NULL;
2797 
2798     /*
2799      * And now open the image and make it consistent first (i.e. increase the
2800      * refcount of the cluster that is occupied by the header and the refcount
2801      * table)
2802      */
2803     options = qdict_new();
2804     qdict_put_str(options, "driver", "qcow2");
2805     blk = blk_new_open(filename, NULL, options,
2806                        BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
2807                        &local_err);
2808     if (blk == NULL) {
2809         error_propagate(errp, local_err);
2810         ret = -EIO;
2811         goto out;
2812     }
2813 
2814     ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
2815     if (ret < 0) {
2816         error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
2817                          "header and refcount table");
2818         goto out;
2819 
2820     } else if (ret != 0) {
2821         error_report("Huh, first cluster in empty image is already in use?");
2822         abort();
2823     }
2824 
2825     /* Create a full header (including things like feature table) */
2826     ret = qcow2_update_header(blk_bs(blk));
2827     if (ret < 0) {
2828         error_setg_errno(errp, -ret, "Could not update qcow2 header");
2829         goto out;
2830     }
2831 
2832     /* Okay, now that we have a valid image, let's give it the right size */
2833     ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
2834     if (ret < 0) {
2835         error_prepend(errp, "Could not resize image: ");
2836         goto out;
2837     }
2838 
2839     /* Want a backing file? There you go.*/
2840     if (backing_file) {
2841         ret = bdrv_change_backing_file(blk_bs(blk), backing_file, backing_format);
2842         if (ret < 0) {
2843             error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
2844                              "with format '%s'", backing_file, backing_format);
2845             goto out;
2846         }
2847     }
2848 
2849     /* Want encryption? There you go. */
2850     if (encryptfmt) {
2851         ret = qcow2_set_up_encryption(blk_bs(blk), encryptfmt, opts, errp);
2852         if (ret < 0) {
2853             goto out;
2854         }
2855     }
2856 
2857     /* And if we're supposed to preallocate metadata, do that now */
2858     if (prealloc != PREALLOC_MODE_OFF) {
2859         ret = preallocate(blk_bs(blk), 0, total_size);
2860         if (ret < 0) {
2861             error_setg_errno(errp, -ret, "Could not preallocate metadata");
2862             goto out;
2863         }
2864     }
2865 
2866     blk_unref(blk);
2867     blk = NULL;
2868 
2869     /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
2870      * Using BDRV_O_NO_IO, since encryption is now setup we don't want to
2871      * have to setup decryption context. We're not doing any I/O on the top
2872      * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does
2873      * not have effect.
2874      */
2875     options = qdict_new();
2876     qdict_put_str(options, "driver", "qcow2");
2877     blk = blk_new_open(filename, NULL, options,
2878                        BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO,
2879                        &local_err);
2880     if (blk == NULL) {
2881         error_propagate(errp, local_err);
2882         ret = -EIO;
2883         goto out;
2884     }
2885 
2886     ret = 0;
2887 out:
2888     if (blk) {
2889         blk_unref(blk);
2890     }
2891     return ret;
2892 }
2893 
2894 static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
2895 {
2896     char *backing_file = NULL;
2897     char *backing_fmt = NULL;
2898     char *buf = NULL;
2899     uint64_t size = 0;
2900     int flags = 0;
2901     size_t cluster_size = DEFAULT_CLUSTER_SIZE;
2902     PreallocMode prealloc;
2903     int version;
2904     uint64_t refcount_bits;
2905     int refcount_order;
2906     char *encryptfmt = NULL;
2907     Error *local_err = NULL;
2908     int ret;
2909 
2910     /* Read out options */
2911     size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2912                     BDRV_SECTOR_SIZE);
2913     backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
2914     backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
2915     encryptfmt = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT);
2916     if (encryptfmt) {
2917         if (qemu_opt_get(opts, BLOCK_OPT_ENCRYPT)) {
2918             error_setg(errp, "Options " BLOCK_OPT_ENCRYPT " and "
2919                        BLOCK_OPT_ENCRYPT_FORMAT " are mutually exclusive");
2920             ret = -EINVAL;
2921             goto finish;
2922         }
2923     } else if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
2924         encryptfmt = g_strdup("aes");
2925     }
2926     cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err);
2927     if (local_err) {
2928         error_propagate(errp, local_err);
2929         ret = -EINVAL;
2930         goto finish;
2931     }
2932     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2933     prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
2934                                PREALLOC_MODE_OFF, &local_err);
2935     if (local_err) {
2936         error_propagate(errp, local_err);
2937         ret = -EINVAL;
2938         goto finish;
2939     }
2940 
2941     version = qcow2_opt_get_version_del(opts, &local_err);
2942     if (local_err) {
2943         error_propagate(errp, local_err);
2944         ret = -EINVAL;
2945         goto finish;
2946     }
2947 
2948     if (qemu_opt_get_bool_del(opts, BLOCK_OPT_LAZY_REFCOUNTS, false)) {
2949         flags |= BLOCK_FLAG_LAZY_REFCOUNTS;
2950     }
2951 
2952     if (backing_file && prealloc != PREALLOC_MODE_OFF) {
2953         error_setg(errp, "Backing file and preallocation cannot be used at "
2954                    "the same time");
2955         ret = -EINVAL;
2956         goto finish;
2957     }
2958 
2959     if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
2960         error_setg(errp, "Lazy refcounts only supported with compatibility "
2961                    "level 1.1 and above (use compat=1.1 or greater)");
2962         ret = -EINVAL;
2963         goto finish;
2964     }
2965 
2966     refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
2967     if (local_err) {
2968         error_propagate(errp, local_err);
2969         ret = -EINVAL;
2970         goto finish;
2971     }
2972 
2973     refcount_order = ctz32(refcount_bits);
2974 
2975     ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
2976                         cluster_size, prealloc, opts, version, refcount_order,
2977                         encryptfmt, &local_err);
2978     error_propagate(errp, local_err);
2979 
2980 finish:
2981     g_free(backing_file);
2982     g_free(backing_fmt);
2983     g_free(encryptfmt);
2984     g_free(buf);
2985     return ret;
2986 }
2987 
2988 
2989 static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
2990 {
2991     int64_t nr;
2992     int res;
2993 
2994     /* Clamp to image length, before checking status of underlying sectors */
2995     if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
2996         bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset;
2997     }
2998 
2999     if (!bytes) {
3000         return true;
3001     }
3002     res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
3003     return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes;
3004 }
3005 
3006 static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
3007     int64_t offset, int bytes, BdrvRequestFlags flags)
3008 {
3009     int ret;
3010     BDRVQcow2State *s = bs->opaque;
3011 
3012     uint32_t head = offset % s->cluster_size;
3013     uint32_t tail = (offset + bytes) % s->cluster_size;
3014 
3015     trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes);
3016     if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) {
3017         tail = 0;
3018     }
3019 
3020     if (head || tail) {
3021         uint64_t off;
3022         unsigned int nr;
3023 
3024         assert(head + bytes <= s->cluster_size);
3025 
3026         /* check whether remainder of cluster already reads as zero */
3027         if (!(is_zero(bs, offset - head, head) &&
3028               is_zero(bs, offset + bytes,
3029                       tail ? s->cluster_size - tail : 0))) {
3030             return -ENOTSUP;
3031         }
3032 
3033         qemu_co_mutex_lock(&s->lock);
3034         /* We can have new write after previous check */
3035         offset = QEMU_ALIGN_DOWN(offset, s->cluster_size);
3036         bytes = s->cluster_size;
3037         nr = s->cluster_size;
3038         ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
3039         if (ret != QCOW2_CLUSTER_UNALLOCATED &&
3040             ret != QCOW2_CLUSTER_ZERO_PLAIN &&
3041             ret != QCOW2_CLUSTER_ZERO_ALLOC) {
3042             qemu_co_mutex_unlock(&s->lock);
3043             return -ENOTSUP;
3044         }
3045     } else {
3046         qemu_co_mutex_lock(&s->lock);
3047     }
3048 
3049     trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes);
3050 
3051     /* Whatever is left can use real zero clusters */
3052     ret = qcow2_cluster_zeroize(bs, offset, bytes, flags);
3053     qemu_co_mutex_unlock(&s->lock);
3054 
3055     return ret;
3056 }
3057 
3058 static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
3059                                           int64_t offset, int bytes)
3060 {
3061     int ret;
3062     BDRVQcow2State *s = bs->opaque;
3063 
3064     if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
3065         assert(bytes < s->cluster_size);
3066         /* Ignore partial clusters, except for the special case of the
3067          * complete partial cluster at the end of an unaligned file */
3068         if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
3069             offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
3070             return -ENOTSUP;
3071         }
3072     }
3073 
3074     qemu_co_mutex_lock(&s->lock);
3075     ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST,
3076                                 false);
3077     qemu_co_mutex_unlock(&s->lock);
3078     return ret;
3079 }
3080 
3081 static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
3082                           PreallocMode prealloc, Error **errp)
3083 {
3084     BDRVQcow2State *s = bs->opaque;
3085     uint64_t old_length;
3086     int64_t new_l1_size;
3087     int ret;
3088 
3089     if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
3090         prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
3091     {
3092         error_setg(errp, "Unsupported preallocation mode '%s'",
3093                    PreallocMode_str(prealloc));
3094         return -ENOTSUP;
3095     }
3096 
3097     if (offset & 511) {
3098         error_setg(errp, "The new size must be a multiple of 512");
3099         return -EINVAL;
3100     }
3101 
3102     /* cannot proceed if image has snapshots */
3103     if (s->nb_snapshots) {
3104         error_setg(errp, "Can't resize an image which has snapshots");
3105         return -ENOTSUP;
3106     }
3107 
3108     /* cannot proceed if image has bitmaps */
3109     if (s->nb_bitmaps) {
3110         /* TODO: resize bitmaps in the image */
3111         error_setg(errp, "Can't resize an image which has bitmaps");
3112         return -ENOTSUP;
3113     }
3114 
3115     old_length = bs->total_sectors * 512;
3116     new_l1_size = size_to_l1(s, offset);
3117 
3118     if (offset < old_length) {
3119         int64_t last_cluster, old_file_size;
3120         if (prealloc != PREALLOC_MODE_OFF) {
3121             error_setg(errp,
3122                        "Preallocation can't be used for shrinking an image");
3123             return -EINVAL;
3124         }
3125 
3126         ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
3127                                     old_length - ROUND_UP(offset,
3128                                                           s->cluster_size),
3129                                     QCOW2_DISCARD_ALWAYS, true);
3130         if (ret < 0) {
3131             error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
3132             return ret;
3133         }
3134 
3135         ret = qcow2_shrink_l1_table(bs, new_l1_size);
3136         if (ret < 0) {
3137             error_setg_errno(errp, -ret,
3138                              "Failed to reduce the number of L2 tables");
3139             return ret;
3140         }
3141 
3142         ret = qcow2_shrink_reftable(bs);
3143         if (ret < 0) {
3144             error_setg_errno(errp, -ret,
3145                              "Failed to discard unused refblocks");
3146             return ret;
3147         }
3148 
3149         old_file_size = bdrv_getlength(bs->file->bs);
3150         if (old_file_size < 0) {
3151             error_setg_errno(errp, -old_file_size,
3152                              "Failed to inquire current file length");
3153             return old_file_size;
3154         }
3155         last_cluster = qcow2_get_last_cluster(bs, old_file_size);
3156         if (last_cluster < 0) {
3157             error_setg_errno(errp, -last_cluster,
3158                              "Failed to find the last cluster");
3159             return last_cluster;
3160         }
3161         if ((last_cluster + 1) * s->cluster_size < old_file_size) {
3162             Error *local_err = NULL;
3163 
3164             bdrv_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
3165                           PREALLOC_MODE_OFF, &local_err);
3166             if (local_err) {
3167                 warn_reportf_err(local_err,
3168                                  "Failed to truncate the tail of the image: ");
3169             }
3170         }
3171     } else {
3172         ret = qcow2_grow_l1_table(bs, new_l1_size, true);
3173         if (ret < 0) {
3174             error_setg_errno(errp, -ret, "Failed to grow the L1 table");
3175             return ret;
3176         }
3177     }
3178 
3179     switch (prealloc) {
3180     case PREALLOC_MODE_OFF:
3181         break;
3182 
3183     case PREALLOC_MODE_METADATA:
3184         ret = preallocate(bs, old_length, offset);
3185         if (ret < 0) {
3186             error_setg_errno(errp, -ret, "Preallocation failed");
3187             return ret;
3188         }
3189         break;
3190 
3191     case PREALLOC_MODE_FALLOC:
3192     case PREALLOC_MODE_FULL:
3193     {
3194         int64_t allocation_start, host_offset, guest_offset;
3195         int64_t clusters_allocated;
3196         int64_t old_file_size, new_file_size;
3197         uint64_t nb_new_data_clusters, nb_new_l2_tables;
3198 
3199         old_file_size = bdrv_getlength(bs->file->bs);
3200         if (old_file_size < 0) {
3201             error_setg_errno(errp, -old_file_size,
3202                              "Failed to inquire current file length");
3203             return old_file_size;
3204         }
3205         old_file_size = ROUND_UP(old_file_size, s->cluster_size);
3206 
3207         nb_new_data_clusters = DIV_ROUND_UP(offset - old_length,
3208                                             s->cluster_size);
3209 
3210         /* This is an overestimation; we will not actually allocate space for
3211          * these in the file but just make sure the new refcount structures are
3212          * able to cover them so we will not have to allocate new refblocks
3213          * while entering the data blocks in the potentially new L2 tables.
3214          * (We do not actually care where the L2 tables are placed. Maybe they
3215          *  are already allocated or they can be placed somewhere before
3216          *  @old_file_size. It does not matter because they will be fully
3217          *  allocated automatically, so they do not need to be covered by the
3218          *  preallocation. All that matters is that we will not have to allocate
3219          *  new refcount structures for them.) */
3220         nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
3221                                         s->cluster_size / sizeof(uint64_t));
3222         /* The cluster range may not be aligned to L2 boundaries, so add one L2
3223          * table for a potential head/tail */
3224         nb_new_l2_tables++;
3225 
3226         allocation_start = qcow2_refcount_area(bs, old_file_size,
3227                                                nb_new_data_clusters +
3228                                                nb_new_l2_tables,
3229                                                true, 0, 0);
3230         if (allocation_start < 0) {
3231             error_setg_errno(errp, -allocation_start,
3232                              "Failed to resize refcount structures");
3233             return allocation_start;
3234         }
3235 
3236         clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
3237                                                      nb_new_data_clusters);
3238         if (clusters_allocated < 0) {
3239             error_setg_errno(errp, -clusters_allocated,
3240                              "Failed to allocate data clusters");
3241             return -clusters_allocated;
3242         }
3243 
3244         assert(clusters_allocated == nb_new_data_clusters);
3245 
3246         /* Allocate the data area */
3247         new_file_size = allocation_start +
3248                         nb_new_data_clusters * s->cluster_size;
3249         ret = bdrv_truncate(bs->file, new_file_size, prealloc, errp);
3250         if (ret < 0) {
3251             error_prepend(errp, "Failed to resize underlying file: ");
3252             qcow2_free_clusters(bs, allocation_start,
3253                                 nb_new_data_clusters * s->cluster_size,
3254                                 QCOW2_DISCARD_OTHER);
3255             return ret;
3256         }
3257 
3258         /* Create the necessary L2 entries */
3259         host_offset = allocation_start;
3260         guest_offset = old_length;
3261         while (nb_new_data_clusters) {
3262             int64_t guest_cluster = guest_offset >> s->cluster_bits;
3263             int64_t nb_clusters = MIN(nb_new_data_clusters,
3264                                       s->l2_size - guest_cluster % s->l2_size);
3265             QCowL2Meta allocation = {
3266                 .offset       = guest_offset,
3267                 .alloc_offset = host_offset,
3268                 .nb_clusters  = nb_clusters,
3269             };
3270             qemu_co_queue_init(&allocation.dependent_requests);
3271 
3272             ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
3273             if (ret < 0) {
3274                 error_setg_errno(errp, -ret, "Failed to update L2 tables");
3275                 qcow2_free_clusters(bs, host_offset,
3276                                     nb_new_data_clusters * s->cluster_size,
3277                                     QCOW2_DISCARD_OTHER);
3278                 return ret;
3279             }
3280 
3281             guest_offset += nb_clusters * s->cluster_size;
3282             host_offset += nb_clusters * s->cluster_size;
3283             nb_new_data_clusters -= nb_clusters;
3284         }
3285         break;
3286     }
3287 
3288     default:
3289         g_assert_not_reached();
3290     }
3291 
3292     if (prealloc != PREALLOC_MODE_OFF) {
3293         /* Flush metadata before actually changing the image size */
3294         ret = bdrv_flush(bs);
3295         if (ret < 0) {
3296             error_setg_errno(errp, -ret,
3297                              "Failed to flush the preallocated area to disk");
3298             return ret;
3299         }
3300     }
3301 
3302     /* write updated header.size */
3303     offset = cpu_to_be64(offset);
3304     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
3305                            &offset, sizeof(uint64_t));
3306     if (ret < 0) {
3307         error_setg_errno(errp, -ret, "Failed to update the image size");
3308         return ret;
3309     }
3310 
3311     s->l1_vm_state_index = new_l1_size;
3312     return 0;
3313 }
3314 
3315 /* XXX: put compressed sectors first, then all the cluster aligned
3316    tables to avoid losing bytes in alignment */
3317 static coroutine_fn int
3318 qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
3319                             uint64_t bytes, QEMUIOVector *qiov)
3320 {
3321     BDRVQcow2State *s = bs->opaque;
3322     QEMUIOVector hd_qiov;
3323     struct iovec iov;
3324     z_stream strm;
3325     int ret, out_len;
3326     uint8_t *buf, *out_buf;
3327     int64_t cluster_offset;
3328 
3329     if (bytes == 0) {
3330         /* align end of file to a sector boundary to ease reading with
3331            sector based I/Os */
3332         cluster_offset = bdrv_getlength(bs->file->bs);
3333         if (cluster_offset < 0) {
3334             return cluster_offset;
3335         }
3336         return bdrv_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF, NULL);
3337     }
3338 
3339     if (offset_into_cluster(s, offset)) {
3340         return -EINVAL;
3341     }
3342 
3343     buf = qemu_blockalign(bs, s->cluster_size);
3344     if (bytes != s->cluster_size) {
3345         if (bytes > s->cluster_size ||
3346             offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS)
3347         {
3348             qemu_vfree(buf);
3349             return -EINVAL;
3350         }
3351         /* Zero-pad last write if image size is not cluster aligned */
3352         memset(buf + bytes, 0, s->cluster_size - bytes);
3353     }
3354     qemu_iovec_to_buf(qiov, 0, buf, bytes);
3355 
3356     out_buf = g_malloc(s->cluster_size);
3357 
3358     /* best compression, small window, no zlib header */
3359     memset(&strm, 0, sizeof(strm));
3360     ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
3361                        Z_DEFLATED, -12,
3362                        9, Z_DEFAULT_STRATEGY);
3363     if (ret != 0) {
3364         ret = -EINVAL;
3365         goto fail;
3366     }
3367 
3368     strm.avail_in = s->cluster_size;
3369     strm.next_in = (uint8_t *)buf;
3370     strm.avail_out = s->cluster_size;
3371     strm.next_out = out_buf;
3372 
3373     ret = deflate(&strm, Z_FINISH);
3374     if (ret != Z_STREAM_END && ret != Z_OK) {
3375         deflateEnd(&strm);
3376         ret = -EINVAL;
3377         goto fail;
3378     }
3379     out_len = strm.next_out - out_buf;
3380 
3381     deflateEnd(&strm);
3382 
3383     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
3384         /* could not compress: write normal cluster */
3385         ret = qcow2_co_pwritev(bs, offset, bytes, qiov, 0);
3386         if (ret < 0) {
3387             goto fail;
3388         }
3389         goto success;
3390     }
3391 
3392     qemu_co_mutex_lock(&s->lock);
3393     cluster_offset =
3394         qcow2_alloc_compressed_cluster_offset(bs, offset, out_len);
3395     if (!cluster_offset) {
3396         qemu_co_mutex_unlock(&s->lock);
3397         ret = -EIO;
3398         goto fail;
3399     }
3400     cluster_offset &= s->cluster_offset_mask;
3401 
3402     ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
3403     qemu_co_mutex_unlock(&s->lock);
3404     if (ret < 0) {
3405         goto fail;
3406     }
3407 
3408     iov = (struct iovec) {
3409         .iov_base   = out_buf,
3410         .iov_len    = out_len,
3411     };
3412     qemu_iovec_init_external(&hd_qiov, &iov, 1);
3413 
3414     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
3415     ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0);
3416     if (ret < 0) {
3417         goto fail;
3418     }
3419 success:
3420     ret = 0;
3421 fail:
3422     qemu_vfree(buf);
3423     g_free(out_buf);
3424     return ret;
3425 }
3426 
3427 static int make_completely_empty(BlockDriverState *bs)
3428 {
3429     BDRVQcow2State *s = bs->opaque;
3430     Error *local_err = NULL;
3431     int ret, l1_clusters;
3432     int64_t offset;
3433     uint64_t *new_reftable = NULL;
3434     uint64_t rt_entry, l1_size2;
3435     struct {
3436         uint64_t l1_offset;
3437         uint64_t reftable_offset;
3438         uint32_t reftable_clusters;
3439     } QEMU_PACKED l1_ofs_rt_ofs_cls;
3440 
3441     ret = qcow2_cache_empty(bs, s->l2_table_cache);
3442     if (ret < 0) {
3443         goto fail;
3444     }
3445 
3446     ret = qcow2_cache_empty(bs, s->refcount_block_cache);
3447     if (ret < 0) {
3448         goto fail;
3449     }
3450 
3451     /* Refcounts will be broken utterly */
3452     ret = qcow2_mark_dirty(bs);
3453     if (ret < 0) {
3454         goto fail;
3455     }
3456 
3457     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
3458 
3459     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
3460     l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t);
3461 
3462     /* After this call, neither the in-memory nor the on-disk refcount
3463      * information accurately describe the actual references */
3464 
3465     ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
3466                              l1_clusters * s->cluster_size, 0);
3467     if (ret < 0) {
3468         goto fail_broken_refcounts;
3469     }
3470     memset(s->l1_table, 0, l1_size2);
3471 
3472     BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
3473 
3474     /* Overwrite enough clusters at the beginning of the sectors to place
3475      * the refcount table, a refcount block and the L1 table in; this may
3476      * overwrite parts of the existing refcount and L1 table, which is not
3477      * an issue because the dirty flag is set, complete data loss is in fact
3478      * desired and partial data loss is consequently fine as well */
3479     ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
3480                              (2 + l1_clusters) * s->cluster_size, 0);
3481     /* This call (even if it failed overall) may have overwritten on-disk
3482      * refcount structures; in that case, the in-memory refcount information
3483      * will probably differ from the on-disk information which makes the BDS
3484      * unusable */
3485     if (ret < 0) {
3486         goto fail_broken_refcounts;
3487     }
3488 
3489     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
3490     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
3491 
3492     /* "Create" an empty reftable (one cluster) directly after the image
3493      * header and an empty L1 table three clusters after the image header;
3494      * the cluster between those two will be used as the first refblock */
3495     l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size);
3496     l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size);
3497     l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1);
3498     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
3499                            &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
3500     if (ret < 0) {
3501         goto fail_broken_refcounts;
3502     }
3503 
3504     s->l1_table_offset = 3 * s->cluster_size;
3505 
3506     new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
3507     if (!new_reftable) {
3508         ret = -ENOMEM;
3509         goto fail_broken_refcounts;
3510     }
3511 
3512     s->refcount_table_offset = s->cluster_size;
3513     s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
3514     s->max_refcount_table_index = 0;
3515 
3516     g_free(s->refcount_table);
3517     s->refcount_table = new_reftable;
3518     new_reftable = NULL;
3519 
3520     /* Now the in-memory refcount information again corresponds to the on-disk
3521      * information (reftable is empty and no refblocks (the refblock cache is
3522      * empty)); however, this means some clusters (e.g. the image header) are
3523      * referenced, but not refcounted, but the normal qcow2 code assumes that
3524      * the in-memory information is always correct */
3525 
3526     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
3527 
3528     /* Enter the first refblock into the reftable */
3529     rt_entry = cpu_to_be64(2 * s->cluster_size);
3530     ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
3531                            &rt_entry, sizeof(rt_entry));
3532     if (ret < 0) {
3533         goto fail_broken_refcounts;
3534     }
3535     s->refcount_table[0] = 2 * s->cluster_size;
3536 
3537     s->free_cluster_index = 0;
3538     assert(3 + l1_clusters <= s->refcount_block_size);
3539     offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
3540     if (offset < 0) {
3541         ret = offset;
3542         goto fail_broken_refcounts;
3543     } else if (offset > 0) {
3544         error_report("First cluster in emptied image is in use");
3545         abort();
3546     }
3547 
3548     /* Now finally the in-memory information corresponds to the on-disk
3549      * structures and is correct */
3550     ret = qcow2_mark_clean(bs);
3551     if (ret < 0) {
3552         goto fail;
3553     }
3554 
3555     ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size,
3556                         PREALLOC_MODE_OFF, &local_err);
3557     if (ret < 0) {
3558         error_report_err(local_err);
3559         goto fail;
3560     }
3561 
3562     return 0;
3563 
3564 fail_broken_refcounts:
3565     /* The BDS is unusable at this point. If we wanted to make it usable, we
3566      * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
3567      * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
3568      * again. However, because the functions which could have caused this error
3569      * path to be taken are used by those functions as well, it's very likely
3570      * that that sequence will fail as well. Therefore, just eject the BDS. */
3571     bs->drv = NULL;
3572 
3573 fail:
3574     g_free(new_reftable);
3575     return ret;
3576 }
3577 
3578 static int qcow2_make_empty(BlockDriverState *bs)
3579 {
3580     BDRVQcow2State *s = bs->opaque;
3581     uint64_t offset, end_offset;
3582     int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
3583     int l1_clusters, ret = 0;
3584 
3585     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
3586 
3587     if (s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps &&
3588         3 + l1_clusters <= s->refcount_block_size &&
3589         s->crypt_method_header != QCOW_CRYPT_LUKS) {
3590         /* The following function only works for qcow2 v3 images (it
3591          * requires the dirty flag) and only as long as there are no
3592          * features that reserve extra clusters (such as snapshots,
3593          * LUKS header, or persistent bitmaps), because it completely
3594          * empties the image.  Furthermore, the L1 table and three
3595          * additional clusters (image header, refcount table, one
3596          * refcount block) have to fit inside one refcount block. */
3597         return make_completely_empty(bs);
3598     }
3599 
3600     /* This fallback code simply discards every active cluster; this is slow,
3601      * but works in all cases */
3602     end_offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3603     for (offset = 0; offset < end_offset; offset += step) {
3604         /* As this function is generally used after committing an external
3605          * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
3606          * default action for this kind of discard is to pass the discard,
3607          * which will ideally result in an actually smaller image file, as
3608          * is probably desired. */
3609         ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset),
3610                                     QCOW2_DISCARD_SNAPSHOT, true);
3611         if (ret < 0) {
3612             break;
3613         }
3614     }
3615 
3616     return ret;
3617 }
3618 
3619 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
3620 {
3621     BDRVQcow2State *s = bs->opaque;
3622     int ret;
3623 
3624     qemu_co_mutex_lock(&s->lock);
3625     ret = qcow2_cache_write(bs, s->l2_table_cache);
3626     if (ret < 0) {
3627         qemu_co_mutex_unlock(&s->lock);
3628         return ret;
3629     }
3630 
3631     if (qcow2_need_accurate_refcounts(s)) {
3632         ret = qcow2_cache_write(bs, s->refcount_block_cache);
3633         if (ret < 0) {
3634             qemu_co_mutex_unlock(&s->lock);
3635             return ret;
3636         }
3637     }
3638     qemu_co_mutex_unlock(&s->lock);
3639 
3640     return 0;
3641 }
3642 
3643 static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
3644                                        Error **errp)
3645 {
3646     Error *local_err = NULL;
3647     BlockMeasureInfo *info;
3648     uint64_t required = 0; /* bytes that contribute to required size */
3649     uint64_t virtual_size; /* disk size as seen by guest */
3650     uint64_t refcount_bits;
3651     uint64_t l2_tables;
3652     size_t cluster_size;
3653     int version;
3654     char *optstr;
3655     PreallocMode prealloc;
3656     bool has_backing_file;
3657 
3658     /* Parse image creation options */
3659     cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err);
3660     if (local_err) {
3661         goto err;
3662     }
3663 
3664     version = qcow2_opt_get_version_del(opts, &local_err);
3665     if (local_err) {
3666         goto err;
3667     }
3668 
3669     refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
3670     if (local_err) {
3671         goto err;
3672     }
3673 
3674     optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
3675     prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
3676                                PREALLOC_MODE_OFF, &local_err);
3677     g_free(optstr);
3678     if (local_err) {
3679         goto err;
3680     }
3681 
3682     optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
3683     has_backing_file = !!optstr;
3684     g_free(optstr);
3685 
3686     virtual_size = align_offset(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
3687                                 cluster_size);
3688 
3689     /* Check that virtual disk size is valid */
3690     l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
3691                              cluster_size / sizeof(uint64_t));
3692     if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) {
3693         error_setg(&local_err, "The image size is too large "
3694                                "(try using a larger cluster size)");
3695         goto err;
3696     }
3697 
3698     /* Account for input image */
3699     if (in_bs) {
3700         int64_t ssize = bdrv_getlength(in_bs);
3701         if (ssize < 0) {
3702             error_setg_errno(&local_err, -ssize,
3703                              "Unable to get image virtual_size");
3704             goto err;
3705         }
3706 
3707         virtual_size = align_offset(ssize, cluster_size);
3708 
3709         if (has_backing_file) {
3710             /* We don't how much of the backing chain is shared by the input
3711              * image and the new image file.  In the worst case the new image's
3712              * backing file has nothing in common with the input image.  Be
3713              * conservative and assume all clusters need to be written.
3714              */
3715             required = virtual_size;
3716         } else {
3717             int64_t offset;
3718             int64_t pnum = 0;
3719 
3720             for (offset = 0; offset < ssize; offset += pnum) {
3721                 int ret;
3722 
3723                 ret = bdrv_block_status_above(in_bs, NULL, offset,
3724                                               ssize - offset, &pnum, NULL,
3725                                               NULL);
3726                 if (ret < 0) {
3727                     error_setg_errno(&local_err, -ret,
3728                                      "Unable to get block status");
3729                     goto err;
3730                 }
3731 
3732                 if (ret & BDRV_BLOCK_ZERO) {
3733                     /* Skip zero regions (safe with no backing file) */
3734                 } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
3735                            (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
3736                     /* Extend pnum to end of cluster for next iteration */
3737                     pnum = ROUND_UP(offset + pnum, cluster_size) - offset;
3738 
3739                     /* Count clusters we've seen */
3740                     required += offset % cluster_size + pnum;
3741                 }
3742             }
3743         }
3744     }
3745 
3746     /* Take into account preallocation.  Nothing special is needed for
3747      * PREALLOC_MODE_METADATA since metadata is always counted.
3748      */
3749     if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
3750         required = virtual_size;
3751     }
3752 
3753     info = g_new(BlockMeasureInfo, 1);
3754     info->fully_allocated =
3755         qcow2_calc_prealloc_size(virtual_size, cluster_size,
3756                                  ctz32(refcount_bits));
3757 
3758     /* Remove data clusters that are not required.  This overestimates the
3759      * required size because metadata needed for the fully allocated file is
3760      * still counted.
3761      */
3762     info->required = info->fully_allocated - virtual_size + required;
3763     return info;
3764 
3765 err:
3766     error_propagate(errp, local_err);
3767     return NULL;
3768 }
3769 
3770 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3771 {
3772     BDRVQcow2State *s = bs->opaque;
3773     bdi->unallocated_blocks_are_zero = true;
3774     bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3);
3775     bdi->cluster_size = s->cluster_size;
3776     bdi->vm_state_offset = qcow2_vm_state_offset(s);
3777     return 0;
3778 }
3779 
3780 static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
3781 {
3782     BDRVQcow2State *s = bs->opaque;
3783     ImageInfoSpecific *spec_info;
3784     QCryptoBlockInfo *encrypt_info = NULL;
3785 
3786     if (s->crypto != NULL) {
3787         encrypt_info = qcrypto_block_get_info(s->crypto, &error_abort);
3788     }
3789 
3790     spec_info = g_new(ImageInfoSpecific, 1);
3791     *spec_info = (ImageInfoSpecific){
3792         .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
3793         .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1),
3794     };
3795     if (s->qcow_version == 2) {
3796         *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
3797             .compat             = g_strdup("0.10"),
3798             .refcount_bits      = s->refcount_bits,
3799         };
3800     } else if (s->qcow_version == 3) {
3801         *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
3802             .compat             = g_strdup("1.1"),
3803             .lazy_refcounts     = s->compatible_features &
3804                                   QCOW2_COMPAT_LAZY_REFCOUNTS,
3805             .has_lazy_refcounts = true,
3806             .corrupt            = s->incompatible_features &
3807                                   QCOW2_INCOMPAT_CORRUPT,
3808             .has_corrupt        = true,
3809             .refcount_bits      = s->refcount_bits,
3810         };
3811     } else {
3812         /* if this assertion fails, this probably means a new version was
3813          * added without having it covered here */
3814         assert(false);
3815     }
3816 
3817     if (encrypt_info) {
3818         ImageInfoSpecificQCow2Encryption *qencrypt =
3819             g_new(ImageInfoSpecificQCow2Encryption, 1);
3820         switch (encrypt_info->format) {
3821         case Q_CRYPTO_BLOCK_FORMAT_QCOW:
3822             qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
3823             qencrypt->u.aes = encrypt_info->u.qcow;
3824             break;
3825         case Q_CRYPTO_BLOCK_FORMAT_LUKS:
3826             qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
3827             qencrypt->u.luks = encrypt_info->u.luks;
3828             break;
3829         default:
3830             abort();
3831         }
3832         /* Since we did shallow copy above, erase any pointers
3833          * in the original info */
3834         memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
3835         qapi_free_QCryptoBlockInfo(encrypt_info);
3836 
3837         spec_info->u.qcow2.data->has_encrypt = true;
3838         spec_info->u.qcow2.data->encrypt = qencrypt;
3839     }
3840 
3841     return spec_info;
3842 }
3843 
3844 static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
3845                               int64_t pos)
3846 {
3847     BDRVQcow2State *s = bs->opaque;
3848 
3849     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
3850     return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos,
3851                                     qiov->size, qiov, 0);
3852 }
3853 
3854 static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
3855                               int64_t pos)
3856 {
3857     BDRVQcow2State *s = bs->opaque;
3858 
3859     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
3860     return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos,
3861                                    qiov->size, qiov, 0);
3862 }
3863 
3864 /*
3865  * Downgrades an image's version. To achieve this, any incompatible features
3866  * have to be removed.
3867  */
3868 static int qcow2_downgrade(BlockDriverState *bs, int target_version,
3869                            BlockDriverAmendStatusCB *status_cb, void *cb_opaque)
3870 {
3871     BDRVQcow2State *s = bs->opaque;
3872     int current_version = s->qcow_version;
3873     int ret;
3874 
3875     if (target_version == current_version) {
3876         return 0;
3877     } else if (target_version > current_version) {
3878         return -EINVAL;
3879     } else if (target_version != 2) {
3880         return -EINVAL;
3881     }
3882 
3883     if (s->refcount_order != 4) {
3884         error_report("compat=0.10 requires refcount_bits=16");
3885         return -ENOTSUP;
3886     }
3887 
3888     /* clear incompatible features */
3889     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
3890         ret = qcow2_mark_clean(bs);
3891         if (ret < 0) {
3892             return ret;
3893         }
3894     }
3895 
3896     /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
3897      * the first place; if that happens nonetheless, returning -ENOTSUP is the
3898      * best thing to do anyway */
3899 
3900     if (s->incompatible_features) {
3901         return -ENOTSUP;
3902     }
3903 
3904     /* since we can ignore compatible features, we can set them to 0 as well */
3905     s->compatible_features = 0;
3906     /* if lazy refcounts have been used, they have already been fixed through
3907      * clearing the dirty flag */
3908 
3909     /* clearing autoclear features is trivial */
3910     s->autoclear_features = 0;
3911 
3912     ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
3913     if (ret < 0) {
3914         return ret;
3915     }
3916 
3917     s->qcow_version = target_version;
3918     ret = qcow2_update_header(bs);
3919     if (ret < 0) {
3920         s->qcow_version = current_version;
3921         return ret;
3922     }
3923     return 0;
3924 }
3925 
/* Long-running operations an amend request may consist of */
typedef enum Qcow2AmendOperation {
    /* Value that Qcow2AmendHelperCBInfo::last_operation is statically
     * initialized to; it lets the helper CB tell its very first invocation
     * apart from a change of the running operation */
    QCOW2_NO_OPERATION = 0,

    QCOW2_CHANGING_REFCOUNT_ORDER = 1,
    QCOW2_DOWNGRADING = 2,
} Qcow2AmendOperation;
3935 
3936 typedef struct Qcow2AmendHelperCBInfo {
3937     /* The code coordinating the amend operations should only modify
3938      * these four fields; the rest will be managed by the CB */
3939     BlockDriverAmendStatusCB *original_status_cb;
3940     void *original_cb_opaque;
3941 
3942     Qcow2AmendOperation current_operation;
3943 
3944     /* Total number of operations to perform (only set once) */
3945     int total_operations;
3946 
3947     /* The following fields are managed by the CB */
3948 
3949     /* Number of operations completed */
3950     int operations_completed;
3951 
3952     /* Cumulative offset of all completed operations */
3953     int64_t offset_completed;
3954 
3955     Qcow2AmendOperation last_operation;
3956     int64_t last_work_size;
3957 } Qcow2AmendHelperCBInfo;
3958 
3959 static void qcow2_amend_helper_cb(BlockDriverState *bs,
3960                                   int64_t operation_offset,
3961                                   int64_t operation_work_size, void *opaque)
3962 {
3963     Qcow2AmendHelperCBInfo *info = opaque;
3964     int64_t current_work_size;
3965     int64_t projected_work_size;
3966 
3967     if (info->current_operation != info->last_operation) {
3968         if (info->last_operation != QCOW2_NO_OPERATION) {
3969             info->offset_completed += info->last_work_size;
3970             info->operations_completed++;
3971         }
3972 
3973         info->last_operation = info->current_operation;
3974     }
3975 
3976     assert(info->total_operations > 0);
3977     assert(info->operations_completed < info->total_operations);
3978 
3979     info->last_work_size = operation_work_size;
3980 
3981     current_work_size = info->offset_completed + operation_work_size;
3982 
3983     /* current_work_size is the total work size for (operations_completed + 1)
3984      * operations (which includes this one), so multiply it by the number of
3985      * operations not covered and divide it by the number of operations
3986      * covered to get a projection for the operations not covered */
3987     projected_work_size = current_work_size * (info->total_operations -
3988                                                info->operations_completed - 1)
3989                                             / (info->operations_completed + 1);
3990 
3991     info->original_status_cb(bs, info->offset_completed + operation_offset,
3992                              current_work_size + projected_work_size,
3993                              info->original_cb_opaque);
3994 }
3995 
3996 static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
3997                                BlockDriverAmendStatusCB *status_cb,
3998                                void *cb_opaque)
3999 {
4000     BDRVQcow2State *s = bs->opaque;
4001     int old_version = s->qcow_version, new_version = old_version;
4002     uint64_t new_size = 0;
4003     const char *backing_file = NULL, *backing_format = NULL;
4004     bool lazy_refcounts = s->use_lazy_refcounts;
4005     const char *compat = NULL;
4006     uint64_t cluster_size = s->cluster_size;
4007     bool encrypt;
4008     int encformat;
4009     int refcount_bits = s->refcount_bits;
4010     Error *local_err = NULL;
4011     int ret;
4012     QemuOptDesc *desc = opts->list->desc;
4013     Qcow2AmendHelperCBInfo helper_cb_info;
4014 
4015     while (desc && desc->name) {
4016         if (!qemu_opt_find(opts, desc->name)) {
4017             /* only change explicitly defined options */
4018             desc++;
4019             continue;
4020         }
4021 
4022         if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
4023             compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
4024             if (!compat) {
4025                 /* preserve default */
4026             } else if (!strcmp(compat, "0.10")) {
4027                 new_version = 2;
4028             } else if (!strcmp(compat, "1.1")) {
4029                 new_version = 3;
4030             } else {
4031                 error_report("Unknown compatibility level %s", compat);
4032                 return -EINVAL;
4033             }
4034         } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
4035             error_report("Cannot change preallocation mode");
4036             return -ENOTSUP;
4037         } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
4038             new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
4039         } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
4040             backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
4041         } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
4042             backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
4043         } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
4044             encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
4045                                         !!s->crypto);
4046 
4047             if (encrypt != !!s->crypto) {
4048                 error_report("Changing the encryption flag is not supported");
4049                 return -ENOTSUP;
4050             }
4051         } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT_FORMAT)) {
4052             encformat = qcow2_crypt_method_from_format(
4053                 qemu_opt_get(opts, BLOCK_OPT_ENCRYPT_FORMAT));
4054 
4055             if (encformat != s->crypt_method_header) {
4056                 error_report("Changing the encryption format is not supported");
4057                 return -ENOTSUP;
4058             }
4059         } else if (g_str_has_prefix(desc->name, "encrypt.")) {
4060             error_report("Changing the encryption parameters is not supported");
4061             return -ENOTSUP;
4062         } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
4063             cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
4064                                              cluster_size);
4065             if (cluster_size != s->cluster_size) {
4066                 error_report("Changing the cluster size is not supported");
4067                 return -ENOTSUP;
4068             }
4069         } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
4070             lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
4071                                                lazy_refcounts);
4072         } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
4073             refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
4074                                                 refcount_bits);
4075 
4076             if (refcount_bits <= 0 || refcount_bits > 64 ||
4077                 !is_power_of_2(refcount_bits))
4078             {
4079                 error_report("Refcount width must be a power of two and may "
4080                              "not exceed 64 bits");
4081                 return -EINVAL;
4082             }
4083         } else {
4084             /* if this point is reached, this probably means a new option was
4085              * added without having it covered here */
4086             abort();
4087         }
4088 
4089         desc++;
4090     }
4091 
4092     helper_cb_info = (Qcow2AmendHelperCBInfo){
4093         .original_status_cb = status_cb,
4094         .original_cb_opaque = cb_opaque,
4095         .total_operations = (new_version < old_version)
4096                           + (s->refcount_bits != refcount_bits)
4097     };
4098 
4099     /* Upgrade first (some features may require compat=1.1) */
4100     if (new_version > old_version) {
4101         s->qcow_version = new_version;
4102         ret = qcow2_update_header(bs);
4103         if (ret < 0) {
4104             s->qcow_version = old_version;
4105             return ret;
4106         }
4107     }
4108 
4109     if (s->refcount_bits != refcount_bits) {
4110         int refcount_order = ctz32(refcount_bits);
4111 
4112         if (new_version < 3 && refcount_bits != 16) {
4113             error_report("Different refcount widths than 16 bits require "
4114                          "compatibility level 1.1 or above (use compat=1.1 or "
4115                          "greater)");
4116             return -EINVAL;
4117         }
4118 
4119         helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
4120         ret = qcow2_change_refcount_order(bs, refcount_order,
4121                                           &qcow2_amend_helper_cb,
4122                                           &helper_cb_info, &local_err);
4123         if (ret < 0) {
4124             error_report_err(local_err);
4125             return ret;
4126         }
4127     }
4128 
4129     if (backing_file || backing_format) {
4130         ret = qcow2_change_backing_file(bs,
4131                     backing_file ?: s->image_backing_file,
4132                     backing_format ?: s->image_backing_format);
4133         if (ret < 0) {
4134             return ret;
4135         }
4136     }
4137 
4138     if (s->use_lazy_refcounts != lazy_refcounts) {
4139         if (lazy_refcounts) {
4140             if (new_version < 3) {
4141                 error_report("Lazy refcounts only supported with compatibility "
4142                              "level 1.1 and above (use compat=1.1 or greater)");
4143                 return -EINVAL;
4144             }
4145             s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
4146             ret = qcow2_update_header(bs);
4147             if (ret < 0) {
4148                 s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
4149                 return ret;
4150             }
4151             s->use_lazy_refcounts = true;
4152         } else {
4153             /* make image clean first */
4154             ret = qcow2_mark_clean(bs);
4155             if (ret < 0) {
4156                 return ret;
4157             }
4158             /* now disallow lazy refcounts */
4159             s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
4160             ret = qcow2_update_header(bs);
4161             if (ret < 0) {
4162                 s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
4163                 return ret;
4164             }
4165             s->use_lazy_refcounts = false;
4166         }
4167     }
4168 
4169     if (new_size) {
4170         BlockBackend *blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
4171         ret = blk_insert_bs(blk, bs, &local_err);
4172         if (ret < 0) {
4173             error_report_err(local_err);
4174             blk_unref(blk);
4175             return ret;
4176         }
4177 
4178         ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, &local_err);
4179         blk_unref(blk);
4180         if (ret < 0) {
4181             error_report_err(local_err);
4182             return ret;
4183         }
4184     }
4185 
4186     /* Downgrade last (so unsupported features can be removed before) */
4187     if (new_version < old_version) {
4188         helper_cb_info.current_operation = QCOW2_DOWNGRADING;
4189         ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
4190                               &helper_cb_info);
4191         if (ret < 0) {
4192             return ret;
4193         }
4194     }
4195 
4196     return 0;
4197 }
4198 
4199 /*
4200  * If offset or size are negative, respectively, they will not be included in
4201  * the BLOCK_IMAGE_CORRUPTED event emitted.
4202  * fatal will be ignored for read-only BDS; corruptions found there will always
4203  * be considered non-fatal.
4204  */
4205 void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
4206                              int64_t size, const char *message_format, ...)
4207 {
4208     BDRVQcow2State *s = bs->opaque;
4209     const char *node_name;
4210     char *message;
4211     va_list ap;
4212 
4213     fatal = fatal && !bs->read_only;
4214 
4215     if (s->signaled_corruption &&
4216         (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
4217     {
4218         return;
4219     }
4220 
4221     va_start(ap, message_format);
4222     message = g_strdup_vprintf(message_format, ap);
4223     va_end(ap);
4224 
4225     if (fatal) {
4226         fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
4227                 "corruption events will be suppressed\n", message);
4228     } else {
4229         fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
4230                 "corruption events will be suppressed\n", message);
4231     }
4232 
4233     node_name = bdrv_get_node_name(bs);
4234     qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
4235                                           *node_name != '\0', node_name,
4236                                           message, offset >= 0, offset,
4237                                           size >= 0, size,
4238                                           fatal, &error_abort);
4239     g_free(message);
4240 
4241     if (fatal) {
4242         qcow2_mark_corrupt(bs);
4243         bs->drv = NULL; /* make BDS unusable */
4244     }
4245 
4246     s->signaled_corruption = true;
4247 }
4248 
4249 static QemuOptsList qcow2_create_opts = {
4250     .name = "qcow2-create-opts",
4251     .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
4252     .desc = {
4253         {
4254             .name = BLOCK_OPT_SIZE,
4255             .type = QEMU_OPT_SIZE,
4256             .help = "Virtual disk size"
4257         },
4258         {
4259             .name = BLOCK_OPT_COMPAT_LEVEL,
4260             .type = QEMU_OPT_STRING,
4261             .help = "Compatibility level (0.10 or 1.1)"
4262         },
4263         {
4264             .name = BLOCK_OPT_BACKING_FILE,
4265             .type = QEMU_OPT_STRING,
4266             .help = "File name of a base image"
4267         },
4268         {
4269             .name = BLOCK_OPT_BACKING_FMT,
4270             .type = QEMU_OPT_STRING,
4271             .help = "Image format of the base image"
4272         },
4273         {
4274             .name = BLOCK_OPT_ENCRYPT,
4275             .type = QEMU_OPT_BOOL,
4276             .help = "Encrypt the image with format 'aes'. (Deprecated "
4277                     "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
4278         },
4279         {
4280             .name = BLOCK_OPT_ENCRYPT_FORMAT,
4281             .type = QEMU_OPT_STRING,
4282             .help = "Encrypt the image, format choices: 'aes', 'luks'",
4283         },
4284         BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
4285             "ID of secret providing qcow AES key or LUKS passphrase"),
4286         BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),
4287         BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),
4288         BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),
4289         BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),
4290         BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),
4291         BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),
4292         {
4293             .name = BLOCK_OPT_CLUSTER_SIZE,
4294             .type = QEMU_OPT_SIZE,
4295             .help = "qcow2 cluster size",
4296             .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
4297         },
4298         {
4299             .name = BLOCK_OPT_PREALLOC,
4300             .type = QEMU_OPT_STRING,
4301             .help = "Preallocation mode (allowed values: off, metadata, "
4302                     "falloc, full)"
4303         },
4304         {
4305             .name = BLOCK_OPT_LAZY_REFCOUNTS,
4306             .type = QEMU_OPT_BOOL,
4307             .help = "Postpone refcount updates",
4308             .def_value_str = "off"
4309         },
4310         {
4311             .name = BLOCK_OPT_REFCOUNT_BITS,
4312             .type = QEMU_OPT_NUMBER,
4313             .help = "Width of a reference count entry in bits",
4314             .def_value_str = "16"
4315         },
4316         { /* end of list */ }
4317     }
4318 };
4319 
4320 BlockDriver bdrv_qcow2 = {
4321     .format_name        = "qcow2",
4322     .instance_size      = sizeof(BDRVQcow2State),
4323     .bdrv_probe         = qcow2_probe,
4324     .bdrv_open          = qcow2_open,
4325     .bdrv_close         = qcow2_close,
4326     .bdrv_reopen_prepare  = qcow2_reopen_prepare,
4327     .bdrv_reopen_commit   = qcow2_reopen_commit,
4328     .bdrv_reopen_abort    = qcow2_reopen_abort,
4329     .bdrv_join_options    = qcow2_join_options,
4330     .bdrv_child_perm      = bdrv_format_default_perms,
4331     .bdrv_create        = qcow2_create,
4332     .bdrv_has_zero_init = bdrv_has_zero_init_1,
4333     .bdrv_co_get_block_status = qcow2_co_get_block_status,
4334 
4335     .bdrv_co_preadv         = qcow2_co_preadv,
4336     .bdrv_co_pwritev        = qcow2_co_pwritev,
4337     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
4338 
4339     .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
4340     .bdrv_co_pdiscard       = qcow2_co_pdiscard,
4341     .bdrv_truncate          = qcow2_truncate,
4342     .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed,
4343     .bdrv_make_empty        = qcow2_make_empty,
4344 
4345     .bdrv_snapshot_create   = qcow2_snapshot_create,
4346     .bdrv_snapshot_goto     = qcow2_snapshot_goto,
4347     .bdrv_snapshot_delete   = qcow2_snapshot_delete,
4348     .bdrv_snapshot_list     = qcow2_snapshot_list,
4349     .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
4350     .bdrv_measure           = qcow2_measure,
4351     .bdrv_get_info          = qcow2_get_info,
4352     .bdrv_get_specific_info = qcow2_get_specific_info,
4353 
4354     .bdrv_save_vmstate    = qcow2_save_vmstate,
4355     .bdrv_load_vmstate    = qcow2_load_vmstate,
4356 
4357     .supports_backing           = true,
4358     .bdrv_change_backing_file   = qcow2_change_backing_file,
4359 
4360     .bdrv_refresh_limits        = qcow2_refresh_limits,
4361     .bdrv_invalidate_cache      = qcow2_invalidate_cache,
4362     .bdrv_inactivate            = qcow2_inactivate,
4363 
4364     .create_opts         = &qcow2_create_opts,
4365     .bdrv_check          = qcow2_check,
4366     .bdrv_amend_options  = qcow2_amend_options,
4367 
4368     .bdrv_detach_aio_context  = qcow2_detach_aio_context,
4369     .bdrv_attach_aio_context  = qcow2_attach_aio_context,
4370 
4371     .bdrv_reopen_bitmaps_rw = qcow2_reopen_bitmaps_rw,
4372     .bdrv_can_store_new_dirty_bitmap = qcow2_can_store_new_dirty_bitmap,
4373     .bdrv_remove_persistent_dirty_bitmap = qcow2_remove_persistent_dirty_bitmap,
4374 };
4375 
4376 static void bdrv_qcow2_init(void)
4377 {
4378     bdrv_register(&bdrv_qcow2);
4379 }
4380 
4381 block_init(bdrv_qcow2_init);
4382