/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "block/block_int.h"
#include "sysemu/block-backend.h"
#include "qemu/module.h"
#include <zlib.h>
#include "block/qcow2.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qbool.h"
#include "qapi/qmp/types.h"
#include "qapi-event.h"
#include "trace.h"
#include "qemu/option_int.h"
#include "qemu/cutils.h"
#include "qemu/bswap.h"
#include "qapi/opts-visitor.h"
#include "qapi-visit.h"
#include "block/crypto.h"

/*
  Differences from QCOW:

  - Support for multiple incremental snapshots.
  - Memory management by reference counts.
  - Clusters with a reference count of one have the QCOW_OFLAG_COPIED bit
    set to optimize write performance.
  - Size of compressed clusters is stored in sectors to reduce bit usage
    in the cluster offsets.
  - Support for storing additional data (such as the VM state) in the
    snapshots.
  - If a backing store is used, the cluster size is not constrained
    (could be backported to QCOW).
  - L2 tables always have a size of one cluster.
*/


typedef struct {
    uint32_t magic;
    uint32_t len;
} QEMU_PACKED QCowExtension;

#define QCOW2_EXT_MAGIC_END 0
#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
#define QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77
#define QCOW2_EXT_MAGIC_BITMAPS 0x23852875

static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
{
    const QCowHeader *cow_header = (const void *)buf;

    if (buf_size >= sizeof(QCowHeader) &&
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
        be32_to_cpu(cow_header->version) >= 2)
        return 100;
    else
        return 0;
}


/* Callback for the crypto layer: read bytes of the encryption header
 * from its location in the image */
static ssize_t qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
                                          uint8_t *buf, size_t buflen,
                                          void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    ssize_t ret;

    if ((offset + buflen) > s->crypto_header.length) {
        error_setg(errp, "Request for data outside of extension header");
        return -1;
    }

    ret = bdrv_pread(bs->file,
                     s->crypto_header.offset + offset, buf, buflen);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read encryption header");
        return -1;
    }
    return ret;
}


/* Callback for the crypto layer: allocate space in the image for the
 * encryption header and record its location */
static ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen,
                                          void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    int64_t ret;
    int64_t clusterlen;

    ret = qcow2_alloc_clusters(bs, headerlen);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "Cannot allocate cluster for LUKS header size %zu",
                         headerlen);
        return -1;
    }

    s->crypto_header.length = headerlen;
    s->crypto_header.offset = ret;

    /* Zero fill remaining space in cluster so it has predictable
     * content in case of future spec changes */
    clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;
    ret = bdrv_pwrite_zeroes(bs->file,
                             ret + headerlen,
                             clusterlen - headerlen, 0);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not zero fill encryption header");
        return -1;
    }

    return ret;
}


/* Callback for the crypto layer: write bytes of the encryption header
 * to its location in the image */
static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
                                           const uint8_t *buf, size_t buflen,
                                           void *opaque, Error **errp)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    ssize_t ret;

    if ((offset + buflen) > s->crypto_header.length) {
        error_setg(errp, "Request for data outside of extension header");
        return -1;
    }

    ret = bdrv_pwrite(bs->file,
                      s->crypto_header.offset + offset, buf, buflen);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write encryption header");
        return -1;
    }
    return ret;
}


/*
 * Read qcow2 extensions and fill bs.
 * Start reading from start_offset;
 * finish upon a magic value of 0 or when end_offset is reached.
 * Unknown magic values are skipped (future extensions this version
 * knows nothing about).
 * Return 0 upon success, non-0 otherwise.
 */
static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
                                 uint64_t end_offset, void **p_feature_table,
                                 int flags, bool *need_update_header,
                                 Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QCowExtension ext;
    uint64_t offset;
    int ret;
    Qcow2BitmapHeaderExt bitmaps_ext;

    if (need_update_header != NULL) {
        *need_update_header = false;
    }

#ifdef DEBUG_EXT
    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
#endif
    offset = start_offset;
    while (offset < end_offset) {

#ifdef DEBUG_EXT
        /* Sanity check */
        if (offset > s->cluster_size)
            printf("qcow2_read_extension: suspicious offset %lu\n", offset);

        printf("attempting to read extended header in offset %lu\n", offset);
#endif

        ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
                             "pread fail from offset %" PRIu64, offset);
            return 1;
        }
        be32_to_cpus(&ext.magic);
        be32_to_cpus(&ext.len);
        offset += sizeof(ext);
#ifdef DEBUG_EXT
        printf("ext.magic = 0x%x\n", ext.magic);
#endif
        if (offset > end_offset || ext.len > end_offset - offset) {
            error_setg(errp, "Header extension too large");
            return -EINVAL;
        }

        switch (ext.magic) {
        case QCOW2_EXT_MAGIC_END:
            return 0;

        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
            if (ext.len >= sizeof(bs->backing_format)) {
                error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
                           " too large (>=%zu)", ext.len,
                           sizeof(bs->backing_format));
                return 2;
            }
            ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
                                 "Could not read format name");
                return 3;
            }
            bs->backing_format[ext.len] = '\0';
            s->image_backing_format = g_strdup(bs->backing_format);
#ifdef DEBUG_EXT
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
#endif
            break;

        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
            if (p_feature_table != NULL) {
                void *feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
                ret = bdrv_pread(bs->file, offset, feature_table, ext.len);
                if (ret < 0) {
                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
                                     "Could not read table");
                    return ret;
                }

                *p_feature_table = feature_table;
            }
            break;

        case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
            unsigned int cflags = 0;
            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
                error_setg(errp, "CRYPTO header extension only "
                           "expected with LUKS encryption method");
                return -EINVAL;
            }
            if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
                error_setg(errp, "CRYPTO header extension size %u, "
                           "but expected size %zu", ext.len,
                           sizeof(Qcow2CryptoHeaderExtension));
                return -EINVAL;
            }

            ret = bdrv_pread(bs->file, offset, &s->crypto_header, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret,
                                 "Unable to read CRYPTO header extension");
                return ret;
            }
            be64_to_cpus(&s->crypto_header.offset);
            be64_to_cpus(&s->crypto_header.length);

            if ((s->crypto_header.offset % s->cluster_size) != 0) {
                error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
                           "not a multiple of cluster size '%u'",
                           s->crypto_header.offset, s->cluster_size);
                return -EINVAL;
            }

            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
                                           qcow2_crypto_hdr_read_func,
                                           bs, cflags, errp);
            if (!s->crypto) {
                return -EINVAL;
            }
        }   break;

        case QCOW2_EXT_MAGIC_BITMAPS:
            if (ext.len != sizeof(bitmaps_ext)) {
                error_setg(errp, "bitmaps_ext: "
                           "Invalid extension length");
                return -EINVAL;
            }

            if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
                error_report("WARNING: a program lacking bitmap support "
                             "modified this file, so all bitmaps are now "
                             "considered inconsistent. Some clusters may be "
                             "leaked; run 'qemu-img check -r' on the image "
                             "file to fix.");
                if (need_update_header != NULL) {
                    /* Updating is needed to drop invalid bitmap extension. */
                    *need_update_header = true;
                }
                break;
            }

            ret = bdrv_pread(bs->file, offset, &bitmaps_ext, ext.len);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "bitmaps_ext: "
                                 "Could not read ext header");
                return ret;
            }

            if (bitmaps_ext.reserved32 != 0) {
                error_setg(errp, "bitmaps_ext: "
                           "Reserved field is not zero");
                return -EINVAL;
            }

            be32_to_cpus(&bitmaps_ext.nb_bitmaps);
            be64_to_cpus(&bitmaps_ext.bitmap_directory_size);
            be64_to_cpus(&bitmaps_ext.bitmap_directory_offset);

            if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
                error_setg(errp,
                           "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
                           "exceeding the QEMU supported maximum of %d",
                           bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
                return -EINVAL;
            }

            if (bitmaps_ext.nb_bitmaps == 0) {
                error_setg(errp, "found bitmaps extension with zero bitmaps");
                return -EINVAL;
            }

            if (bitmaps_ext.bitmap_directory_offset & (s->cluster_size - 1)) {
                error_setg(errp, "bitmaps_ext: "
                           "invalid bitmap directory offset");
                return -EINVAL;
            }

            if (bitmaps_ext.bitmap_directory_size >
                QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
                error_setg(errp, "bitmaps_ext: "
                           "bitmap directory size (%" PRIu64 ") exceeds "
                           "the maximum supported size (%d)",
                           bitmaps_ext.bitmap_directory_size,
                           QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
                return -EINVAL;
            }

            s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
            s->bitmap_directory_offset =
                    bitmaps_ext.bitmap_directory_offset;
            s->bitmap_directory_size =
                    bitmaps_ext.bitmap_directory_size;

#ifdef DEBUG_EXT
            printf("Qcow2: Got bitmaps extension: "
                   "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
                   s->bitmap_directory_offset, s->nb_bitmaps);
#endif
            break;

        default:
            /* unknown magic - save it in case we need to rewrite the header */
            {
                Qcow2UnknownHeaderExtension *uext;

                uext = g_malloc0(sizeof(*uext) + ext.len);
                uext->magic = ext.magic;
                uext->len = ext.len;
                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);

                ret = bdrv_pread(bs->file, offset, uext->data, uext->len);
                if (ret < 0) {
                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
                                     "Could not read data");
                    return ret;
                }
            }
            break;
        }

        offset += ((ext.len + 7) & ~7);
    }

    return 0;
}

static void cleanup_unknown_header_ext(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2UnknownHeaderExtension *uext, *next;

    QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
        QLIST_REMOVE(uext, next);
        g_free(uext);
    }
}

static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
                                       uint64_t mask)
{
    char *features = g_strdup("");
    char *old;

    while (table && table->name[0] != '\0') {
        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
            if (mask & (1ULL << table->bit)) {
                old = features;
                features = g_strdup_printf("%s%s%.46s", old, *old ?
", " : "", 423 table->name); 424 g_free(old); 425 mask &= ~(1ULL << table->bit); 426 } 427 } 428 table++; 429 } 430 431 if (mask) { 432 old = features; 433 features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64, 434 old, *old ? ", " : "", mask); 435 g_free(old); 436 } 437 438 error_setg(errp, "Unsupported qcow2 feature(s): %s", features); 439 g_free(features); 440 } 441 442 /* 443 * Sets the dirty bit and flushes afterwards if necessary. 444 * 445 * The incompatible_features bit is only set if the image file header was 446 * updated successfully. Therefore it is not required to check the return 447 * value of this function. 448 */ 449 int qcow2_mark_dirty(BlockDriverState *bs) 450 { 451 BDRVQcow2State *s = bs->opaque; 452 uint64_t val; 453 int ret; 454 455 assert(s->qcow_version >= 3); 456 457 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { 458 return 0; /* already dirty */ 459 } 460 461 val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); 462 ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), 463 &val, sizeof(val)); 464 if (ret < 0) { 465 return ret; 466 } 467 ret = bdrv_flush(bs->file->bs); 468 if (ret < 0) { 469 return ret; 470 } 471 472 /* Only treat image as dirty if the header was updated successfully */ 473 s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; 474 return 0; 475 } 476 477 /* 478 * Clears the dirty bit and flushes before if necessary. Only call this 479 * function when there are no pending requests, it does not guard against 480 * concurrent requests dirtying the image. 481 */ 482 static int qcow2_mark_clean(BlockDriverState *bs) 483 { 484 BDRVQcow2State *s = bs->opaque; 485 486 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { 487 int ret; 488 489 s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; 490 491 ret = bdrv_flush(bs); 492 if (ret < 0) { 493 return ret; 494 } 495 496 return qcow2_update_header(bs); 497 } 498 return 0; 499 } 500 501 /* 502 * Marks the image as corrupt. 503 */ 504 int qcow2_mark_corrupt(BlockDriverState *bs) 505 { 506 BDRVQcow2State *s = bs->opaque; 507 508 s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; 509 return qcow2_update_header(bs); 510 } 511 512 /* 513 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes 514 * before if necessary. 515 */ 516 int qcow2_mark_consistent(BlockDriverState *bs) 517 { 518 BDRVQcow2State *s = bs->opaque; 519 520 if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { 521 int ret = bdrv_flush(bs); 522 if (ret < 0) { 523 return ret; 524 } 525 526 s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; 527 return qcow2_update_header(bs); 528 } 529 return 0; 530 } 531 532 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, 533 BdrvCheckMode fix) 534 { 535 int ret = qcow2_check_refcounts(bs, result, fix); 536 if (ret < 0) { 537 return ret; 538 } 539 540 if (fix && result->check_errors == 0 && result->corruptions == 0) { 541 ret = qcow2_mark_clean(bs); 542 if (ret < 0) { 543 return ret; 544 } 545 return qcow2_mark_consistent(bs); 546 } 547 return ret; 548 } 549 550 static int validate_table_offset(BlockDriverState *bs, uint64_t offset, 551 uint64_t entries, size_t entry_len) 552 { 553 BDRVQcow2State *s = bs->opaque; 554 uint64_t size; 555 556 /* Use signed INT64_MAX as the maximum even for uint64_t header fields, 557 * because values will be passed to qemu functions taking int64_t. 
*/ 558 if (entries > INT64_MAX / entry_len) { 559 return -EINVAL; 560 } 561 562 size = entries * entry_len; 563 564 if (INT64_MAX - size < offset) { 565 return -EINVAL; 566 } 567 568 /* Tables must be cluster aligned */ 569 if (offset_into_cluster(s, offset) != 0) { 570 return -EINVAL; 571 } 572 573 return 0; 574 } 575 576 static QemuOptsList qcow2_runtime_opts = { 577 .name = "qcow2", 578 .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), 579 .desc = { 580 { 581 .name = QCOW2_OPT_LAZY_REFCOUNTS, 582 .type = QEMU_OPT_BOOL, 583 .help = "Postpone refcount updates", 584 }, 585 { 586 .name = QCOW2_OPT_DISCARD_REQUEST, 587 .type = QEMU_OPT_BOOL, 588 .help = "Pass guest discard requests to the layer below", 589 }, 590 { 591 .name = QCOW2_OPT_DISCARD_SNAPSHOT, 592 .type = QEMU_OPT_BOOL, 593 .help = "Generate discard requests when snapshot related space " 594 "is freed", 595 }, 596 { 597 .name = QCOW2_OPT_DISCARD_OTHER, 598 .type = QEMU_OPT_BOOL, 599 .help = "Generate discard requests when other clusters are freed", 600 }, 601 { 602 .name = QCOW2_OPT_OVERLAP, 603 .type = QEMU_OPT_STRING, 604 .help = "Selects which overlap checks to perform from a range of " 605 "templates (none, constant, cached, all)", 606 }, 607 { 608 .name = QCOW2_OPT_OVERLAP_TEMPLATE, 609 .type = QEMU_OPT_STRING, 610 .help = "Selects which overlap checks to perform from a range of " 611 "templates (none, constant, cached, all)", 612 }, 613 { 614 .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, 615 .type = QEMU_OPT_BOOL, 616 .help = "Check for unintended writes into the main qcow2 header", 617 }, 618 { 619 .name = QCOW2_OPT_OVERLAP_ACTIVE_L1, 620 .type = QEMU_OPT_BOOL, 621 .help = "Check for unintended writes into the active L1 table", 622 }, 623 { 624 .name = QCOW2_OPT_OVERLAP_ACTIVE_L2, 625 .type = QEMU_OPT_BOOL, 626 .help = "Check for unintended writes into an active L2 table", 627 }, 628 { 629 .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, 630 .type = QEMU_OPT_BOOL, 631 .help = "Check for unintended writes into the refcount table", 632 }, 633 { 634 .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, 635 .type = QEMU_OPT_BOOL, 636 .help = "Check for unintended writes into a refcount block", 637 }, 638 { 639 .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, 640 .type = QEMU_OPT_BOOL, 641 .help = "Check for unintended writes into the snapshot table", 642 }, 643 { 644 .name = QCOW2_OPT_OVERLAP_INACTIVE_L1, 645 .type = QEMU_OPT_BOOL, 646 .help = "Check for unintended writes into an inactive L1 table", 647 }, 648 { 649 .name = QCOW2_OPT_OVERLAP_INACTIVE_L2, 650 .type = QEMU_OPT_BOOL, 651 .help = "Check for unintended writes into an inactive L2 table", 652 }, 653 { 654 .name = QCOW2_OPT_CACHE_SIZE, 655 .type = QEMU_OPT_SIZE, 656 .help = "Maximum combined metadata (L2 tables and refcount blocks) " 657 "cache size", 658 }, 659 { 660 .name = QCOW2_OPT_L2_CACHE_SIZE, 661 .type = QEMU_OPT_SIZE, 662 .help = "Maximum L2 table cache size", 663 }, 664 { 665 .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE, 666 .type = QEMU_OPT_SIZE, 667 .help = "Maximum refcount block cache size", 668 }, 669 { 670 .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL, 671 .type = QEMU_OPT_NUMBER, 672 .help = "Clean unused cache entries after this time (in seconds)", 673 }, 674 BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.", 675 "ID of secret providing qcow2 AES key or LUKS passphrase"), 676 { /* end of list */ } 677 }, 678 }; 679 680 static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { 681 [QCOW2_OL_MAIN_HEADER_BITNR] = QCOW2_OPT_OVERLAP_MAIN_HEADER, 682 [QCOW2_OL_ACTIVE_L1_BITNR] = 
                                       QCOW2_OPT_OVERLAP_ACTIVE_L1,
    [QCOW2_OL_ACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    [QCOW2_OL_INACTIVE_L1_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
};

static void cache_clean_timer_cb(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVQcow2State *s = bs->opaque;
    qcow2_cache_clean_unused(bs, s->l2_table_cache);
    qcow2_cache_clean_unused(bs, s->refcount_block_cache);
    timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
              (int64_t) s->cache_clean_interval * 1000);
}

static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
{
    BDRVQcow2State *s = bs->opaque;
    if (s->cache_clean_interval > 0) {
        s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL,
                                             SCALE_MS, cache_clean_timer_cb,
                                             bs);
        timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
                  (int64_t) s->cache_clean_interval * 1000);
    }
}

static void cache_clean_timer_del(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    if (s->cache_clean_timer) {
        timer_del(s->cache_clean_timer);
        timer_free(s->cache_clean_timer);
        s->cache_clean_timer = NULL;
    }
}

static void qcow2_detach_aio_context(BlockDriverState *bs)
{
    cache_clean_timer_del(bs);
}

static void qcow2_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    cache_clean_timer_init(bs, new_context);
}

static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                             uint64_t *l2_cache_size,
                             uint64_t *refcount_cache_size, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t combined_cache_size;
    bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;

    combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
    l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
    refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);

    combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
    *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0);
    *refcount_cache_size = qemu_opt_get_size(opts,
                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);

    if (combined_cache_size_set) {
        if (l2_cache_size_set && refcount_cache_size_set) {
            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
                       "at the same time");
            return;
        } else if (*l2_cache_size > combined_cache_size) {
            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
                       QCOW2_OPT_CACHE_SIZE);
            return;
        } else if (*refcount_cache_size > combined_cache_size) {
            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
                       QCOW2_OPT_CACHE_SIZE);
            return;
        }

        if (l2_cache_size_set) {
            *refcount_cache_size = combined_cache_size - *l2_cache_size;
        } else if (refcount_cache_size_set) {
            *l2_cache_size = combined_cache_size - *refcount_cache_size;
        } else {
            /* Split the combined size between the two caches using the
             * default L2-to-refcount size ratio */
            *refcount_cache_size = combined_cache_size
                                 / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
            *l2_cache_size = combined_cache_size - *refcount_cache_size;
        }
    } else {
        if (!l2_cache_size_set &&
!refcount_cache_size_set) { 778 *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE, 779 (uint64_t)DEFAULT_L2_CACHE_CLUSTERS 780 * s->cluster_size); 781 *refcount_cache_size = *l2_cache_size 782 / DEFAULT_L2_REFCOUNT_SIZE_RATIO; 783 } else if (!l2_cache_size_set) { 784 *l2_cache_size = *refcount_cache_size 785 * DEFAULT_L2_REFCOUNT_SIZE_RATIO; 786 } else if (!refcount_cache_size_set) { 787 *refcount_cache_size = *l2_cache_size 788 / DEFAULT_L2_REFCOUNT_SIZE_RATIO; 789 } 790 } 791 } 792 793 typedef struct Qcow2ReopenState { 794 Qcow2Cache *l2_table_cache; 795 Qcow2Cache *refcount_block_cache; 796 bool use_lazy_refcounts; 797 int overlap_check; 798 bool discard_passthrough[QCOW2_DISCARD_MAX]; 799 uint64_t cache_clean_interval; 800 QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */ 801 } Qcow2ReopenState; 802 803 static int qcow2_update_options_prepare(BlockDriverState *bs, 804 Qcow2ReopenState *r, 805 QDict *options, int flags, 806 Error **errp) 807 { 808 BDRVQcow2State *s = bs->opaque; 809 QemuOpts *opts = NULL; 810 const char *opt_overlap_check, *opt_overlap_check_template; 811 int overlap_check_template = 0; 812 uint64_t l2_cache_size, refcount_cache_size; 813 int i; 814 const char *encryptfmt; 815 QDict *encryptopts = NULL; 816 Error *local_err = NULL; 817 int ret; 818 819 qdict_extract_subqdict(options, &encryptopts, "encrypt."); 820 encryptfmt = qdict_get_try_str(encryptopts, "format"); 821 822 opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); 823 qemu_opts_absorb_qdict(opts, options, &local_err); 824 if (local_err) { 825 error_propagate(errp, local_err); 826 ret = -EINVAL; 827 goto fail; 828 } 829 830 /* get L2 table/refcount block cache size from command line options */ 831 read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size, 832 &local_err); 833 if (local_err) { 834 error_propagate(errp, local_err); 835 ret = -EINVAL; 836 goto fail; 837 } 838 839 l2_cache_size /= s->cluster_size; 840 if (l2_cache_size < MIN_L2_CACHE_SIZE) { 841 l2_cache_size = MIN_L2_CACHE_SIZE; 842 } 843 if (l2_cache_size > INT_MAX) { 844 error_setg(errp, "L2 cache size too big"); 845 ret = -EINVAL; 846 goto fail; 847 } 848 849 refcount_cache_size /= s->cluster_size; 850 if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) { 851 refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE; 852 } 853 if (refcount_cache_size > INT_MAX) { 854 error_setg(errp, "Refcount cache size too big"); 855 ret = -EINVAL; 856 goto fail; 857 } 858 859 /* alloc new L2 table/refcount block cache, flush old one */ 860 if (s->l2_table_cache) { 861 ret = qcow2_cache_flush(bs, s->l2_table_cache); 862 if (ret) { 863 error_setg_errno(errp, -ret, "Failed to flush the L2 table cache"); 864 goto fail; 865 } 866 } 867 868 if (s->refcount_block_cache) { 869 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 870 if (ret) { 871 error_setg_errno(errp, -ret, 872 "Failed to flush the refcount block cache"); 873 goto fail; 874 } 875 } 876 877 r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size); 878 r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size); 879 if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) { 880 error_setg(errp, "Could not allocate metadata caches"); 881 ret = -ENOMEM; 882 goto fail; 883 } 884 885 /* New interval for cache cleanup timer */ 886 r->cache_clean_interval = 887 qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL, 888 s->cache_clean_interval); 889 #ifndef CONFIG_LINUX 890 if (r->cache_clean_interval != 0) { 891 error_setg(errp, 
QCOW2_OPT_CACHE_CLEAN_INTERVAL 892 " not supported on this host"); 893 ret = -EINVAL; 894 goto fail; 895 } 896 #endif 897 if (r->cache_clean_interval > UINT_MAX) { 898 error_setg(errp, "Cache clean interval too big"); 899 ret = -EINVAL; 900 goto fail; 901 } 902 903 /* lazy-refcounts; flush if going from enabled to disabled */ 904 r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, 905 (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); 906 if (r->use_lazy_refcounts && s->qcow_version < 3) { 907 error_setg(errp, "Lazy refcounts require a qcow2 image with at least " 908 "qemu 1.1 compatibility level"); 909 ret = -EINVAL; 910 goto fail; 911 } 912 913 if (s->use_lazy_refcounts && !r->use_lazy_refcounts) { 914 ret = qcow2_mark_clean(bs); 915 if (ret < 0) { 916 error_setg_errno(errp, -ret, "Failed to disable lazy refcounts"); 917 goto fail; 918 } 919 } 920 921 /* Overlap check options */ 922 opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP); 923 opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE); 924 if (opt_overlap_check_template && opt_overlap_check && 925 strcmp(opt_overlap_check_template, opt_overlap_check)) 926 { 927 error_setg(errp, "Conflicting values for qcow2 options '" 928 QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE 929 "' ('%s')", opt_overlap_check, opt_overlap_check_template); 930 ret = -EINVAL; 931 goto fail; 932 } 933 if (!opt_overlap_check) { 934 opt_overlap_check = opt_overlap_check_template ?: "cached"; 935 } 936 937 if (!strcmp(opt_overlap_check, "none")) { 938 overlap_check_template = 0; 939 } else if (!strcmp(opt_overlap_check, "constant")) { 940 overlap_check_template = QCOW2_OL_CONSTANT; 941 } else if (!strcmp(opt_overlap_check, "cached")) { 942 overlap_check_template = QCOW2_OL_CACHED; 943 } else if (!strcmp(opt_overlap_check, "all")) { 944 overlap_check_template = QCOW2_OL_ALL; 945 } else { 946 error_setg(errp, "Unsupported value '%s' for qcow2 option " 947 "'overlap-check'. 
Allowed are any of the following: " 948 "none, constant, cached, all", opt_overlap_check); 949 ret = -EINVAL; 950 goto fail; 951 } 952 953 r->overlap_check = 0; 954 for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { 955 /* overlap-check defines a template bitmask, but every flag may be 956 * overwritten through the associated boolean option */ 957 r->overlap_check |= 958 qemu_opt_get_bool(opts, overlap_bool_option_names[i], 959 overlap_check_template & (1 << i)) << i; 960 } 961 962 r->discard_passthrough[QCOW2_DISCARD_NEVER] = false; 963 r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; 964 r->discard_passthrough[QCOW2_DISCARD_REQUEST] = 965 qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, 966 flags & BDRV_O_UNMAP); 967 r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = 968 qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); 969 r->discard_passthrough[QCOW2_DISCARD_OTHER] = 970 qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); 971 972 switch (s->crypt_method_header) { 973 case QCOW_CRYPT_NONE: 974 if (encryptfmt) { 975 error_setg(errp, "No encryption in image header, but options " 976 "specified format '%s'", encryptfmt); 977 ret = -EINVAL; 978 goto fail; 979 } 980 break; 981 982 case QCOW_CRYPT_AES: 983 if (encryptfmt && !g_str_equal(encryptfmt, "aes")) { 984 error_setg(errp, 985 "Header reported 'aes' encryption format but " 986 "options specify '%s'", encryptfmt); 987 ret = -EINVAL; 988 goto fail; 989 } 990 qdict_del(encryptopts, "format"); 991 r->crypto_opts = block_crypto_open_opts_init( 992 Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp); 993 break; 994 995 case QCOW_CRYPT_LUKS: 996 if (encryptfmt && !g_str_equal(encryptfmt, "luks")) { 997 error_setg(errp, 998 "Header reported 'luks' encryption format but " 999 "options specify '%s'", encryptfmt); 1000 ret = -EINVAL; 1001 goto fail; 1002 } 1003 qdict_del(encryptopts, "format"); 1004 r->crypto_opts = block_crypto_open_opts_init( 1005 Q_CRYPTO_BLOCK_FORMAT_LUKS, encryptopts, errp); 1006 break; 1007 1008 default: 1009 error_setg(errp, "Unsupported encryption method %d", 1010 s->crypt_method_header); 1011 break; 1012 } 1013 if (s->crypt_method_header != QCOW_CRYPT_NONE && !r->crypto_opts) { 1014 ret = -EINVAL; 1015 goto fail; 1016 } 1017 1018 ret = 0; 1019 fail: 1020 QDECREF(encryptopts); 1021 qemu_opts_del(opts); 1022 opts = NULL; 1023 return ret; 1024 } 1025 1026 static void qcow2_update_options_commit(BlockDriverState *bs, 1027 Qcow2ReopenState *r) 1028 { 1029 BDRVQcow2State *s = bs->opaque; 1030 int i; 1031 1032 if (s->l2_table_cache) { 1033 qcow2_cache_destroy(bs, s->l2_table_cache); 1034 } 1035 if (s->refcount_block_cache) { 1036 qcow2_cache_destroy(bs, s->refcount_block_cache); 1037 } 1038 s->l2_table_cache = r->l2_table_cache; 1039 s->refcount_block_cache = r->refcount_block_cache; 1040 1041 s->overlap_check = r->overlap_check; 1042 s->use_lazy_refcounts = r->use_lazy_refcounts; 1043 1044 for (i = 0; i < QCOW2_DISCARD_MAX; i++) { 1045 s->discard_passthrough[i] = r->discard_passthrough[i]; 1046 } 1047 1048 if (s->cache_clean_interval != r->cache_clean_interval) { 1049 cache_clean_timer_del(bs); 1050 s->cache_clean_interval = r->cache_clean_interval; 1051 cache_clean_timer_init(bs, bdrv_get_aio_context(bs)); 1052 } 1053 1054 qapi_free_QCryptoBlockOpenOptions(s->crypto_opts); 1055 s->crypto_opts = r->crypto_opts; 1056 } 1057 1058 static void qcow2_update_options_abort(BlockDriverState *bs, 1059 Qcow2ReopenState *r) 1060 { 1061 if (r->l2_table_cache) { 1062 qcow2_cache_destroy(bs, r->l2_table_cache); 1063 } 1064 if 
(r->refcount_block_cache) { 1065 qcow2_cache_destroy(bs, r->refcount_block_cache); 1066 } 1067 qapi_free_QCryptoBlockOpenOptions(r->crypto_opts); 1068 } 1069 1070 static int qcow2_update_options(BlockDriverState *bs, QDict *options, 1071 int flags, Error **errp) 1072 { 1073 Qcow2ReopenState r = {}; 1074 int ret; 1075 1076 ret = qcow2_update_options_prepare(bs, &r, options, flags, errp); 1077 if (ret >= 0) { 1078 qcow2_update_options_commit(bs, &r); 1079 } else { 1080 qcow2_update_options_abort(bs, &r); 1081 } 1082 1083 return ret; 1084 } 1085 1086 static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, 1087 Error **errp) 1088 { 1089 BDRVQcow2State *s = bs->opaque; 1090 unsigned int len, i; 1091 int ret = 0; 1092 QCowHeader header; 1093 Error *local_err = NULL; 1094 uint64_t ext_end; 1095 uint64_t l1_vm_state_index; 1096 bool update_header = false; 1097 1098 ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); 1099 if (ret < 0) { 1100 error_setg_errno(errp, -ret, "Could not read qcow2 header"); 1101 goto fail; 1102 } 1103 be32_to_cpus(&header.magic); 1104 be32_to_cpus(&header.version); 1105 be64_to_cpus(&header.backing_file_offset); 1106 be32_to_cpus(&header.backing_file_size); 1107 be64_to_cpus(&header.size); 1108 be32_to_cpus(&header.cluster_bits); 1109 be32_to_cpus(&header.crypt_method); 1110 be64_to_cpus(&header.l1_table_offset); 1111 be32_to_cpus(&header.l1_size); 1112 be64_to_cpus(&header.refcount_table_offset); 1113 be32_to_cpus(&header.refcount_table_clusters); 1114 be64_to_cpus(&header.snapshots_offset); 1115 be32_to_cpus(&header.nb_snapshots); 1116 1117 if (header.magic != QCOW_MAGIC) { 1118 error_setg(errp, "Image is not in qcow2 format"); 1119 ret = -EINVAL; 1120 goto fail; 1121 } 1122 if (header.version < 2 || header.version > 3) { 1123 error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version); 1124 ret = -ENOTSUP; 1125 goto fail; 1126 } 1127 1128 s->qcow_version = header.version; 1129 1130 /* Initialise cluster size */ 1131 if (header.cluster_bits < MIN_CLUSTER_BITS || 1132 header.cluster_bits > MAX_CLUSTER_BITS) { 1133 error_setg(errp, "Unsupported cluster size: 2^%" PRIu32, 1134 header.cluster_bits); 1135 ret = -EINVAL; 1136 goto fail; 1137 } 1138 1139 s->cluster_bits = header.cluster_bits; 1140 s->cluster_size = 1 << s->cluster_bits; 1141 s->cluster_sectors = 1 << (s->cluster_bits - 9); 1142 1143 /* Initialise version 3 header fields */ 1144 if (header.version == 2) { 1145 header.incompatible_features = 0; 1146 header.compatible_features = 0; 1147 header.autoclear_features = 0; 1148 header.refcount_order = 4; 1149 header.header_length = 72; 1150 } else { 1151 be64_to_cpus(&header.incompatible_features); 1152 be64_to_cpus(&header.compatible_features); 1153 be64_to_cpus(&header.autoclear_features); 1154 be32_to_cpus(&header.refcount_order); 1155 be32_to_cpus(&header.header_length); 1156 1157 if (header.header_length < 104) { 1158 error_setg(errp, "qcow2 header too short"); 1159 ret = -EINVAL; 1160 goto fail; 1161 } 1162 } 1163 1164 if (header.header_length > s->cluster_size) { 1165 error_setg(errp, "qcow2 header exceeds cluster size"); 1166 ret = -EINVAL; 1167 goto fail; 1168 } 1169 1170 if (header.header_length > sizeof(header)) { 1171 s->unknown_header_fields_size = header.header_length - sizeof(header); 1172 s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); 1173 ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, 1174 s->unknown_header_fields_size); 1175 if (ret < 0) { 1176 error_setg_errno(errp, -ret, 
"Could not read unknown qcow2 header " 1177 "fields"); 1178 goto fail; 1179 } 1180 } 1181 1182 if (header.backing_file_offset > s->cluster_size) { 1183 error_setg(errp, "Invalid backing file offset"); 1184 ret = -EINVAL; 1185 goto fail; 1186 } 1187 1188 if (header.backing_file_offset) { 1189 ext_end = header.backing_file_offset; 1190 } else { 1191 ext_end = 1 << header.cluster_bits; 1192 } 1193 1194 /* Handle feature bits */ 1195 s->incompatible_features = header.incompatible_features; 1196 s->compatible_features = header.compatible_features; 1197 s->autoclear_features = header.autoclear_features; 1198 1199 if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { 1200 void *feature_table = NULL; 1201 qcow2_read_extensions(bs, header.header_length, ext_end, 1202 &feature_table, flags, NULL, NULL); 1203 report_unsupported_feature(errp, feature_table, 1204 s->incompatible_features & 1205 ~QCOW2_INCOMPAT_MASK); 1206 ret = -ENOTSUP; 1207 g_free(feature_table); 1208 goto fail; 1209 } 1210 1211 if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { 1212 /* Corrupt images may not be written to unless they are being repaired 1213 */ 1214 if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { 1215 error_setg(errp, "qcow2: Image is corrupt; cannot be opened " 1216 "read/write"); 1217 ret = -EACCES; 1218 goto fail; 1219 } 1220 } 1221 1222 /* Check support for various header values */ 1223 if (header.refcount_order > 6) { 1224 error_setg(errp, "Reference count entry width too large; may not " 1225 "exceed 64 bits"); 1226 ret = -EINVAL; 1227 goto fail; 1228 } 1229 s->refcount_order = header.refcount_order; 1230 s->refcount_bits = 1 << s->refcount_order; 1231 s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); 1232 s->refcount_max += s->refcount_max - 1; 1233 1234 s->crypt_method_header = header.crypt_method; 1235 if (s->crypt_method_header) { 1236 if (bdrv_uses_whitelist() && 1237 s->crypt_method_header == QCOW_CRYPT_AES) { 1238 error_setg(errp, 1239 "Use of AES-CBC encrypted qcow2 images is no longer " 1240 "supported in system emulators"); 1241 error_append_hint(errp, 1242 "You can use 'qemu-img convert' to convert your " 1243 "image to an alternative supported format, such " 1244 "as unencrypted qcow2, or raw with the LUKS " 1245 "format instead.\n"); 1246 ret = -ENOSYS; 1247 goto fail; 1248 } 1249 1250 if (s->crypt_method_header == QCOW_CRYPT_AES) { 1251 s->crypt_physical_offset = false; 1252 } else { 1253 /* Assuming LUKS and any future crypt methods we 1254 * add will all use physical offsets, due to the 1255 * fact that the alternative is insecure... 
*/ 1256 s->crypt_physical_offset = true; 1257 } 1258 1259 bs->encrypted = true; 1260 } 1261 1262 s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ 1263 s->l2_size = 1 << s->l2_bits; 1264 /* 2^(s->refcount_order - 3) is the refcount width in bytes */ 1265 s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3); 1266 s->refcount_block_size = 1 << s->refcount_block_bits; 1267 bs->total_sectors = header.size / 512; 1268 s->csize_shift = (62 - (s->cluster_bits - 8)); 1269 s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; 1270 s->cluster_offset_mask = (1LL << s->csize_shift) - 1; 1271 1272 s->refcount_table_offset = header.refcount_table_offset; 1273 s->refcount_table_size = 1274 header.refcount_table_clusters << (s->cluster_bits - 3); 1275 1276 if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) { 1277 error_setg(errp, "Reference count table too large"); 1278 ret = -EINVAL; 1279 goto fail; 1280 } 1281 1282 ret = validate_table_offset(bs, s->refcount_table_offset, 1283 s->refcount_table_size, sizeof(uint64_t)); 1284 if (ret < 0) { 1285 error_setg(errp, "Invalid reference count table offset"); 1286 goto fail; 1287 } 1288 1289 /* Snapshot table offset/length */ 1290 if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) { 1291 error_setg(errp, "Too many snapshots"); 1292 ret = -EINVAL; 1293 goto fail; 1294 } 1295 1296 ret = validate_table_offset(bs, header.snapshots_offset, 1297 header.nb_snapshots, 1298 sizeof(QCowSnapshotHeader)); 1299 if (ret < 0) { 1300 error_setg(errp, "Invalid snapshot table offset"); 1301 goto fail; 1302 } 1303 1304 /* read the level 1 table */ 1305 if (header.l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { 1306 error_setg(errp, "Active L1 table too large"); 1307 ret = -EFBIG; 1308 goto fail; 1309 } 1310 s->l1_size = header.l1_size; 1311 1312 l1_vm_state_index = size_to_l1(s, header.size); 1313 if (l1_vm_state_index > INT_MAX) { 1314 error_setg(errp, "Image is too big"); 1315 ret = -EFBIG; 1316 goto fail; 1317 } 1318 s->l1_vm_state_index = l1_vm_state_index; 1319 1320 /* the L1 table must contain at least enough entries to put 1321 header.size bytes */ 1322 if (s->l1_size < s->l1_vm_state_index) { 1323 error_setg(errp, "L1 table is too small"); 1324 ret = -EINVAL; 1325 goto fail; 1326 } 1327 1328 ret = validate_table_offset(bs, header.l1_table_offset, 1329 header.l1_size, sizeof(uint64_t)); 1330 if (ret < 0) { 1331 error_setg(errp, "Invalid L1 table offset"); 1332 goto fail; 1333 } 1334 s->l1_table_offset = header.l1_table_offset; 1335 1336 1337 if (s->l1_size > 0) { 1338 s->l1_table = qemu_try_blockalign(bs->file->bs, 1339 align_offset(s->l1_size * sizeof(uint64_t), 512)); 1340 if (s->l1_table == NULL) { 1341 error_setg(errp, "Could not allocate L1 table"); 1342 ret = -ENOMEM; 1343 goto fail; 1344 } 1345 ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, 1346 s->l1_size * sizeof(uint64_t)); 1347 if (ret < 0) { 1348 error_setg_errno(errp, -ret, "Could not read L1 table"); 1349 goto fail; 1350 } 1351 for(i = 0;i < s->l1_size; i++) { 1352 be64_to_cpus(&s->l1_table[i]); 1353 } 1354 } 1355 1356 /* Parse driver-specific options */ 1357 ret = qcow2_update_options(bs, options, flags, errp); 1358 if (ret < 0) { 1359 goto fail; 1360 } 1361 1362 s->cluster_cache_offset = -1; 1363 s->flags = flags; 1364 1365 ret = qcow2_refcount_init(bs); 1366 if (ret != 0) { 1367 error_setg_errno(errp, -ret, "Could not initialize refcount handling"); 1368 goto fail; 1369 } 1370 1371 QLIST_INIT(&s->cluster_allocs); 1372 QTAILQ_INIT(&s->discards); 1373 1374 
/* read qcow2 extensions */ 1375 if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL, 1376 flags, &update_header, &local_err)) { 1377 error_propagate(errp, local_err); 1378 ret = -EINVAL; 1379 goto fail; 1380 } 1381 1382 /* qcow2_read_extension may have set up the crypto context 1383 * if the crypt method needs a header region, some methods 1384 * don't need header extensions, so must check here 1385 */ 1386 if (s->crypt_method_header && !s->crypto) { 1387 if (s->crypt_method_header == QCOW_CRYPT_AES) { 1388 unsigned int cflags = 0; 1389 if (flags & BDRV_O_NO_IO) { 1390 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; 1391 } 1392 s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.", 1393 NULL, NULL, cflags, errp); 1394 if (!s->crypto) { 1395 ret = -EINVAL; 1396 goto fail; 1397 } 1398 } else if (!(flags & BDRV_O_NO_IO)) { 1399 error_setg(errp, "Missing CRYPTO header for crypt method %d", 1400 s->crypt_method_header); 1401 ret = -EINVAL; 1402 goto fail; 1403 } 1404 } 1405 1406 /* read the backing file name */ 1407 if (header.backing_file_offset != 0) { 1408 len = header.backing_file_size; 1409 if (len > MIN(1023, s->cluster_size - header.backing_file_offset) || 1410 len >= sizeof(bs->backing_file)) { 1411 error_setg(errp, "Backing file name too long"); 1412 ret = -EINVAL; 1413 goto fail; 1414 } 1415 ret = bdrv_pread(bs->file, header.backing_file_offset, 1416 bs->backing_file, len); 1417 if (ret < 0) { 1418 error_setg_errno(errp, -ret, "Could not read backing file name"); 1419 goto fail; 1420 } 1421 bs->backing_file[len] = '\0'; 1422 s->image_backing_file = g_strdup(bs->backing_file); 1423 } 1424 1425 /* Internal snapshots */ 1426 s->snapshots_offset = header.snapshots_offset; 1427 s->nb_snapshots = header.nb_snapshots; 1428 1429 ret = qcow2_read_snapshots(bs); 1430 if (ret < 0) { 1431 error_setg_errno(errp, -ret, "Could not read snapshots"); 1432 goto fail; 1433 } 1434 1435 /* Clear unknown autoclear feature bits */ 1436 update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK; 1437 update_header = 1438 update_header && !bs->read_only && !(flags & BDRV_O_INACTIVE); 1439 if (update_header) { 1440 s->autoclear_features &= QCOW2_AUTOCLEAR_MASK; 1441 } 1442 1443 if (qcow2_load_autoloading_dirty_bitmaps(bs, &local_err)) { 1444 update_header = false; 1445 } 1446 if (local_err != NULL) { 1447 error_propagate(errp, local_err); 1448 ret = -EINVAL; 1449 goto fail; 1450 } 1451 1452 if (update_header) { 1453 ret = qcow2_update_header(bs); 1454 if (ret < 0) { 1455 error_setg_errno(errp, -ret, "Could not update qcow2 header"); 1456 goto fail; 1457 } 1458 } 1459 1460 /* Initialise locks */ 1461 qemu_co_mutex_init(&s->lock); 1462 bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; 1463 1464 /* Repair image if dirty */ 1465 if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only && 1466 (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { 1467 BdrvCheckResult result = {0}; 1468 1469 ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS); 1470 if (ret < 0) { 1471 error_setg_errno(errp, -ret, "Could not repair dirty image"); 1472 goto fail; 1473 } 1474 } 1475 1476 #ifdef DEBUG_ALLOC 1477 { 1478 BdrvCheckResult result = {0}; 1479 qcow2_check_refcounts(bs, &result, 0); 1480 } 1481 #endif 1482 return ret; 1483 1484 fail: 1485 g_free(s->unknown_header_fields); 1486 cleanup_unknown_header_ext(bs); 1487 qcow2_free_snapshots(bs); 1488 qcow2_refcount_close(bs); 1489 qemu_vfree(s->l1_table); 1490 /* else pre-write overlap checks in cache_destroy may crash */ 1491 s->l1_table = NULL; 1492 
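    /* Tear down the cache-clean timer, metadata caches and crypto state
     * that were set up before the failure */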
cache_clean_timer_del(bs); 1493 if (s->l2_table_cache) { 1494 qcow2_cache_destroy(bs, s->l2_table_cache); 1495 } 1496 if (s->refcount_block_cache) { 1497 qcow2_cache_destroy(bs, s->refcount_block_cache); 1498 } 1499 qcrypto_block_free(s->crypto); 1500 qapi_free_QCryptoBlockOpenOptions(s->crypto_opts); 1501 return ret; 1502 } 1503 1504 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, 1505 Error **errp) 1506 { 1507 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, 1508 false, errp); 1509 if (!bs->file) { 1510 return -EINVAL; 1511 } 1512 1513 return qcow2_do_open(bs, options, flags, errp); 1514 } 1515 1516 static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) 1517 { 1518 BDRVQcow2State *s = bs->opaque; 1519 1520 if (bs->encrypted) { 1521 /* Encryption works on a sector granularity */ 1522 bs->bl.request_alignment = BDRV_SECTOR_SIZE; 1523 } 1524 bs->bl.pwrite_zeroes_alignment = s->cluster_size; 1525 bs->bl.pdiscard_alignment = s->cluster_size; 1526 } 1527 1528 static int qcow2_reopen_prepare(BDRVReopenState *state, 1529 BlockReopenQueue *queue, Error **errp) 1530 { 1531 Qcow2ReopenState *r; 1532 int ret; 1533 1534 r = g_new0(Qcow2ReopenState, 1); 1535 state->opaque = r; 1536 1537 ret = qcow2_update_options_prepare(state->bs, r, state->options, 1538 state->flags, errp); 1539 if (ret < 0) { 1540 goto fail; 1541 } 1542 1543 /* We need to write out any unwritten data if we reopen read-only. */ 1544 if ((state->flags & BDRV_O_RDWR) == 0) { 1545 ret = qcow2_reopen_bitmaps_ro(state->bs, errp); 1546 if (ret < 0) { 1547 goto fail; 1548 } 1549 1550 ret = bdrv_flush(state->bs); 1551 if (ret < 0) { 1552 goto fail; 1553 } 1554 1555 ret = qcow2_mark_clean(state->bs); 1556 if (ret < 0) { 1557 goto fail; 1558 } 1559 } 1560 1561 return 0; 1562 1563 fail: 1564 qcow2_update_options_abort(state->bs, r); 1565 g_free(r); 1566 return ret; 1567 } 1568 1569 static void qcow2_reopen_commit(BDRVReopenState *state) 1570 { 1571 qcow2_update_options_commit(state->bs, state->opaque); 1572 g_free(state->opaque); 1573 } 1574 1575 static void qcow2_reopen_abort(BDRVReopenState *state) 1576 { 1577 qcow2_update_options_abort(state->bs, state->opaque); 1578 g_free(state->opaque); 1579 } 1580 1581 static void qcow2_join_options(QDict *options, QDict *old_options) 1582 { 1583 bool has_new_overlap_template = 1584 qdict_haskey(options, QCOW2_OPT_OVERLAP) || 1585 qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE); 1586 bool has_new_total_cache_size = 1587 qdict_haskey(options, QCOW2_OPT_CACHE_SIZE); 1588 bool has_all_cache_options; 1589 1590 /* New overlap template overrides all old overlap options */ 1591 if (has_new_overlap_template) { 1592 qdict_del(old_options, QCOW2_OPT_OVERLAP); 1593 qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE); 1594 qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER); 1595 qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1); 1596 qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2); 1597 qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE); 1598 qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK); 1599 qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE); 1600 qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1); 1601 qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2); 1602 } 1603 1604 /* New total cache size overrides all old options */ 1605 if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) { 1606 qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE); 1607 qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE); 1608 } 1609 1610 
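    /* Absorb the remaining old options into the new ones; with overwrite set
     * to false, keys already present in the new options take precedence */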
qdict_join(options, old_options, false); 1611 1612 /* 1613 * If after merging all cache size options are set, an old total size is 1614 * overwritten. Do keep all options, however, if all three are new. The 1615 * resulting error message is what we want to happen. 1616 */ 1617 has_all_cache_options = 1618 qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) || 1619 qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) || 1620 qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE); 1621 1622 if (has_all_cache_options && !has_new_total_cache_size) { 1623 qdict_del(options, QCOW2_OPT_CACHE_SIZE); 1624 } 1625 } 1626 1627 static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, 1628 int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) 1629 { 1630 BDRVQcow2State *s = bs->opaque; 1631 uint64_t cluster_offset; 1632 int index_in_cluster, ret; 1633 unsigned int bytes; 1634 int64_t status = 0; 1635 1636 bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE); 1637 qemu_co_mutex_lock(&s->lock); 1638 ret = qcow2_get_cluster_offset(bs, sector_num << 9, &bytes, 1639 &cluster_offset); 1640 qemu_co_mutex_unlock(&s->lock); 1641 if (ret < 0) { 1642 return ret; 1643 } 1644 1645 *pnum = bytes >> BDRV_SECTOR_BITS; 1646 1647 if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && 1648 !s->crypto) { 1649 index_in_cluster = sector_num & (s->cluster_sectors - 1); 1650 cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); 1651 *file = bs->file->bs; 1652 status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; 1653 } 1654 if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) { 1655 status |= BDRV_BLOCK_ZERO; 1656 } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { 1657 status |= BDRV_BLOCK_DATA; 1658 } 1659 return status; 1660 } 1661 1662 /* handle reading after the end of the backing file */ 1663 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, 1664 int64_t offset, int bytes) 1665 { 1666 uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE; 1667 int n1; 1668 1669 if ((offset + bytes) <= bs_size) { 1670 return bytes; 1671 } 1672 1673 if (offset >= bs_size) { 1674 n1 = 0; 1675 } else { 1676 n1 = bs_size - offset; 1677 } 1678 1679 qemu_iovec_memset(qiov, n1, 0, bytes - n1); 1680 1681 return n1; 1682 } 1683 1684 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset, 1685 uint64_t bytes, QEMUIOVector *qiov, 1686 int flags) 1687 { 1688 BDRVQcow2State *s = bs->opaque; 1689 int offset_in_cluster, n1; 1690 int ret; 1691 unsigned int cur_bytes; /* number of bytes in current iteration */ 1692 uint64_t cluster_offset = 0; 1693 uint64_t bytes_done = 0; 1694 QEMUIOVector hd_qiov; 1695 uint8_t *cluster_data = NULL; 1696 1697 qemu_iovec_init(&hd_qiov, qiov->niov); 1698 1699 qemu_co_mutex_lock(&s->lock); 1700 1701 while (bytes != 0) { 1702 1703 /* prepare next request */ 1704 cur_bytes = MIN(bytes, INT_MAX); 1705 if (s->crypto) { 1706 cur_bytes = MIN(cur_bytes, 1707 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); 1708 } 1709 1710 ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset); 1711 if (ret < 0) { 1712 goto fail; 1713 } 1714 1715 offset_in_cluster = offset_into_cluster(s, offset); 1716 1717 qemu_iovec_reset(&hd_qiov); 1718 qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes); 1719 1720 switch (ret) { 1721 case QCOW2_CLUSTER_UNALLOCATED: 1722 1723 if (bs->backing) { 1724 /* read from the base image */ 1725 n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov, 1726 offset, cur_bytes); 1727 if (n1 > 0) { 1728 QEMUIOVector local_qiov; 
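                /* local_qiov covers only the first n1 bytes, which fall
                 * within the backing file */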
1729 1730 qemu_iovec_init(&local_qiov, hd_qiov.niov); 1731 qemu_iovec_concat(&local_qiov, &hd_qiov, 0, n1); 1732 1733 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); 1734 qemu_co_mutex_unlock(&s->lock); 1735 ret = bdrv_co_preadv(bs->backing, offset, n1, 1736 &local_qiov, 0); 1737 qemu_co_mutex_lock(&s->lock); 1738 1739 qemu_iovec_destroy(&local_qiov); 1740 1741 if (ret < 0) { 1742 goto fail; 1743 } 1744 } 1745 } else { 1746 /* Note: in this case, no need to wait */ 1747 qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes); 1748 } 1749 break; 1750 1751 case QCOW2_CLUSTER_ZERO_PLAIN: 1752 case QCOW2_CLUSTER_ZERO_ALLOC: 1753 qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes); 1754 break; 1755 1756 case QCOW2_CLUSTER_COMPRESSED: 1757 /* add AIO support for compressed blocks ? */ 1758 ret = qcow2_decompress_cluster(bs, cluster_offset); 1759 if (ret < 0) { 1760 goto fail; 1761 } 1762 1763 qemu_iovec_from_buf(&hd_qiov, 0, 1764 s->cluster_cache + offset_in_cluster, 1765 cur_bytes); 1766 break; 1767 1768 case QCOW2_CLUSTER_NORMAL: 1769 if ((cluster_offset & 511) != 0) { 1770 ret = -EIO; 1771 goto fail; 1772 } 1773 1774 if (bs->encrypted) { 1775 assert(s->crypto); 1776 1777 /* 1778 * For encrypted images, read everything into a temporary 1779 * contiguous buffer on which the AES functions can work. 1780 */ 1781 if (!cluster_data) { 1782 cluster_data = 1783 qemu_try_blockalign(bs->file->bs, 1784 QCOW_MAX_CRYPT_CLUSTERS 1785 * s->cluster_size); 1786 if (cluster_data == NULL) { 1787 ret = -ENOMEM; 1788 goto fail; 1789 } 1790 } 1791 1792 assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); 1793 qemu_iovec_reset(&hd_qiov); 1794 qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes); 1795 } 1796 1797 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); 1798 qemu_co_mutex_unlock(&s->lock); 1799 ret = bdrv_co_preadv(bs->file, 1800 cluster_offset + offset_in_cluster, 1801 cur_bytes, &hd_qiov, 0); 1802 qemu_co_mutex_lock(&s->lock); 1803 if (ret < 0) { 1804 goto fail; 1805 } 1806 if (bs->encrypted) { 1807 assert(s->crypto); 1808 assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); 1809 assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0); 1810 if (qcrypto_block_decrypt(s->crypto, 1811 (s->crypt_physical_offset ? 
                                          cluster_offset + offset_in_cluster :
                                          offset) >> BDRV_SECTOR_BITS,
                                          cluster_data,
                                          cur_bytes,
                                          NULL) < 0) {
                    ret = -EIO;
                    goto fail;
                }
                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
            }
            break;

        default:
            g_assert_not_reached();
            ret = -EIO;
            goto fail;
        }

        bytes -= cur_bytes;
        offset += cur_bytes;
        bytes_done += cur_bytes;
    }
    ret = 0;

fail:
    qemu_co_mutex_unlock(&s->lock);

    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cluster_data);

    return ret;
}

/* Check if it's possible to merge a write request with the writing of
 * the data from the COW regions */
static bool merge_cow(uint64_t offset, unsigned bytes,
                      QEMUIOVector *hd_qiov, QCowL2Meta *l2meta)
{
    QCowL2Meta *m;

    for (m = l2meta; m != NULL; m = m->next) {
        /* If both COW regions are empty then there's nothing to merge */
        if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
            continue;
        }

        /* The data (middle) region must be immediately after the
         * start region */
        if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
            continue;
        }

        /* The end region must be immediately after the data (middle)
         * region */
        if (m->offset + m->cow_end.offset != offset + bytes) {
            continue;
        }

        /* Make sure that adding both COW regions to the QEMUIOVector
         * does not exceed IOV_MAX */
        if (hd_qiov->niov > IOV_MAX - 2) {
            continue;
        }

        m->data_qiov = hd_qiov;
        return true;
    }

    return false;
}

static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
{
    BDRVQcow2State *s = bs->opaque;
    int offset_in_cluster;
    int ret;
    unsigned int cur_bytes; /* number of bytes in current iteration */
    uint64_t cluster_offset;
    QEMUIOVector hd_qiov;
    uint64_t bytes_done = 0;
    uint8_t *cluster_data = NULL;
    QCowL2Meta *l2meta = NULL;

    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);

    qemu_iovec_init(&hd_qiov, qiov->niov);

    s->cluster_cache_offset = -1; /* disable compressed cache */

    qemu_co_mutex_lock(&s->lock);

    while (bytes != 0) {

        l2meta = NULL;

        trace_qcow2_writev_start_part(qemu_coroutine_self());
        offset_in_cluster = offset_into_cluster(s, offset);
        cur_bytes = MIN(bytes, INT_MAX);
        if (bs->encrypted) {
            cur_bytes = MIN(cur_bytes,
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
                            - offset_in_cluster);
        }

        ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
                                         &cluster_offset, &l2meta);
        if (ret < 0) {
            goto fail;
        }

        assert((cluster_offset & 511) == 0);

        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);

        if (bs->encrypted) {
            assert(s->crypto);
            if (!cluster_data) {
                cluster_data = qemu_try_blockalign(bs->file->bs,
                                                   QCOW_MAX_CRYPT_CLUSTERS
                                                   * s->cluster_size);
                if (cluster_data == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
            }

            assert(hd_qiov.size <=
                   QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
            qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);

            if (qcrypto_block_encrypt(s->crypto,
                                      (s->crypt_physical_offset ?
1947 cluster_offset + offset_in_cluster : 1948 offset) >> BDRV_SECTOR_BITS, 1949 cluster_data, 1950 cur_bytes, NULL) < 0) { 1951 ret = -EIO; 1952 goto fail; 1953 } 1954 1955 qemu_iovec_reset(&hd_qiov); 1956 qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes); 1957 } 1958 1959 ret = qcow2_pre_write_overlap_check(bs, 0, 1960 cluster_offset + offset_in_cluster, cur_bytes); 1961 if (ret < 0) { 1962 goto fail; 1963 } 1964 1965 /* If we need to do COW, check if it's possible to merge the 1966 * writing of the guest data together with that of the COW regions. 1967 * If it's not possible (or not necessary) then write the 1968 * guest data now. */ 1969 if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) { 1970 qemu_co_mutex_unlock(&s->lock); 1971 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); 1972 trace_qcow2_writev_data(qemu_coroutine_self(), 1973 cluster_offset + offset_in_cluster); 1974 ret = bdrv_co_pwritev(bs->file, 1975 cluster_offset + offset_in_cluster, 1976 cur_bytes, &hd_qiov, 0); 1977 qemu_co_mutex_lock(&s->lock); 1978 if (ret < 0) { 1979 goto fail; 1980 } 1981 } 1982 1983 while (l2meta != NULL) { 1984 QCowL2Meta *next; 1985 1986 ret = qcow2_alloc_cluster_link_l2(bs, l2meta); 1987 if (ret < 0) { 1988 goto fail; 1989 } 1990 1991 /* Take the request off the list of running requests */ 1992 if (l2meta->nb_clusters != 0) { 1993 QLIST_REMOVE(l2meta, next_in_flight); 1994 } 1995 1996 qemu_co_queue_restart_all(&l2meta->dependent_requests); 1997 1998 next = l2meta->next; 1999 g_free(l2meta); 2000 l2meta = next; 2001 } 2002 2003 bytes -= cur_bytes; 2004 offset += cur_bytes; 2005 bytes_done += cur_bytes; 2006 trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes); 2007 } 2008 ret = 0; 2009 2010 fail: 2011 while (l2meta != NULL) { 2012 QCowL2Meta *next; 2013 2014 if (l2meta->nb_clusters != 0) { 2015 QLIST_REMOVE(l2meta, next_in_flight); 2016 } 2017 qemu_co_queue_restart_all(&l2meta->dependent_requests); 2018 2019 next = l2meta->next; 2020 g_free(l2meta); 2021 l2meta = next; 2022 } 2023 2024 qemu_co_mutex_unlock(&s->lock); 2025 2026 qemu_iovec_destroy(&hd_qiov); 2027 qemu_vfree(cluster_data); 2028 trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); 2029 2030 return ret; 2031 } 2032 2033 static int qcow2_inactivate(BlockDriverState *bs) 2034 { 2035 BDRVQcow2State *s = bs->opaque; 2036 int ret, result = 0; 2037 Error *local_err = NULL; 2038 2039 qcow2_store_persistent_dirty_bitmaps(bs, &local_err); 2040 if (local_err != NULL) { 2041 result = -EINVAL; 2042 error_report_err(local_err); 2043 error_report("Persistent bitmaps are lost for node '%s'", 2044 bdrv_get_device_or_node_name(bs)); 2045 } 2046 2047 ret = qcow2_cache_flush(bs, s->l2_table_cache); 2048 if (ret) { 2049 result = ret; 2050 error_report("Failed to flush the L2 table cache: %s", 2051 strerror(-ret)); 2052 } 2053 2054 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 2055 if (ret) { 2056 result = ret; 2057 error_report("Failed to flush the refcount block cache: %s", 2058 strerror(-ret)); 2059 } 2060 2061 if (result == 0) { 2062 qcow2_mark_clean(bs); 2063 } 2064 2065 return result; 2066 } 2067 2068 static void qcow2_close(BlockDriverState *bs) 2069 { 2070 BDRVQcow2State *s = bs->opaque; 2071 qemu_vfree(s->l1_table); 2072 /* else pre-write overlap checks in cache_destroy may crash */ 2073 s->l1_table = NULL; 2074 2075 if (!(s->flags & BDRV_O_INACTIVE)) { 2076 qcow2_inactivate(bs); 2077 } 2078 2079 cache_clean_timer_del(bs); 2080 qcow2_cache_destroy(bs, s->l2_table_cache); 2081 qcow2_cache_destroy(bs, s->refcount_block_cache); 2082 
2083 qcrypto_block_free(s->crypto); 2084 s->crypto = NULL; 2085 2086 g_free(s->unknown_header_fields); 2087 cleanup_unknown_header_ext(bs); 2088 2089 g_free(s->image_backing_file); 2090 g_free(s->image_backing_format); 2091 2092 g_free(s->cluster_cache); 2093 qemu_vfree(s->cluster_data); 2094 qcow2_refcount_close(bs); 2095 qcow2_free_snapshots(bs); 2096 } 2097 2098 static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) 2099 { 2100 BDRVQcow2State *s = bs->opaque; 2101 int flags = s->flags; 2102 QCryptoBlock *crypto = NULL; 2103 QDict *options; 2104 Error *local_err = NULL; 2105 int ret; 2106 2107 /* 2108 * Backing files are read-only which makes all of their metadata immutable, 2109 * that means we don't have to worry about reopening them here. 2110 */ 2111 2112 crypto = s->crypto; 2113 s->crypto = NULL; 2114 2115 qcow2_close(bs); 2116 2117 memset(s, 0, sizeof(BDRVQcow2State)); 2118 options = qdict_clone_shallow(bs->options); 2119 2120 flags &= ~BDRV_O_INACTIVE; 2121 ret = qcow2_do_open(bs, options, flags, &local_err); 2122 QDECREF(options); 2123 if (local_err) { 2124 error_propagate(errp, local_err); 2125 error_prepend(errp, "Could not reopen qcow2 layer: "); 2126 bs->drv = NULL; 2127 return; 2128 } else if (ret < 0) { 2129 error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); 2130 bs->drv = NULL; 2131 return; 2132 } 2133 2134 s->crypto = crypto; 2135 } 2136 2137 static size_t header_ext_add(char *buf, uint32_t magic, const void *s, 2138 size_t len, size_t buflen) 2139 { 2140 QCowExtension *ext_backing_fmt = (QCowExtension*) buf; 2141 size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); 2142 2143 if (buflen < ext_len) { 2144 return -ENOSPC; 2145 } 2146 2147 *ext_backing_fmt = (QCowExtension) { 2148 .magic = cpu_to_be32(magic), 2149 .len = cpu_to_be32(len), 2150 }; 2151 2152 if (len) { 2153 memcpy(buf + sizeof(QCowExtension), s, len); 2154 } 2155 2156 return ext_len; 2157 } 2158 2159 /* 2160 * Updates the qcow2 header, including the variable length parts of it, i.e. 2161 * the backing file name and all extensions. qcow2 was not designed to allow 2162 * such changes, so if we run out of space (we can only use the first cluster) 2163 * this function may fail. 2164 * 2165 * Returns 0 on success, -errno in error cases. 
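 *
 * For orientation (an illustrative sketch, not a format definition), the
 * rewritten first cluster ends up containing, in order:
 *   - the fixed QCowHeader (72 bytes for version 2, 104 bytes for version 3)
 *   - any unknown header fields preserved from the original header
 *   - the header extensions, each an 8-byte {magic, len} prefix plus a
 *     payload padded to a multiple of 8 bytes
 *   - the end-of-extensions marker (magic 0)
 *   - the backing file name, if any (not NUL-terminated)
 *   - zeroes up to the end of the cluster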
2166 */ 2167 int qcow2_update_header(BlockDriverState *bs) 2168 { 2169 BDRVQcow2State *s = bs->opaque; 2170 QCowHeader *header; 2171 char *buf; 2172 size_t buflen = s->cluster_size; 2173 int ret; 2174 uint64_t total_size; 2175 uint32_t refcount_table_clusters; 2176 size_t header_length; 2177 Qcow2UnknownHeaderExtension *uext; 2178 2179 buf = qemu_blockalign(bs, buflen); 2180 2181 /* Header structure */ 2182 header = (QCowHeader*) buf; 2183 2184 if (buflen < sizeof(*header)) { 2185 ret = -ENOSPC; 2186 goto fail; 2187 } 2188 2189 header_length = sizeof(*header) + s->unknown_header_fields_size; 2190 total_size = bs->total_sectors * BDRV_SECTOR_SIZE; 2191 refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); 2192 2193 *header = (QCowHeader) { 2194 /* Version 2 fields */ 2195 .magic = cpu_to_be32(QCOW_MAGIC), 2196 .version = cpu_to_be32(s->qcow_version), 2197 .backing_file_offset = 0, 2198 .backing_file_size = 0, 2199 .cluster_bits = cpu_to_be32(s->cluster_bits), 2200 .size = cpu_to_be64(total_size), 2201 .crypt_method = cpu_to_be32(s->crypt_method_header), 2202 .l1_size = cpu_to_be32(s->l1_size), 2203 .l1_table_offset = cpu_to_be64(s->l1_table_offset), 2204 .refcount_table_offset = cpu_to_be64(s->refcount_table_offset), 2205 .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), 2206 .nb_snapshots = cpu_to_be32(s->nb_snapshots), 2207 .snapshots_offset = cpu_to_be64(s->snapshots_offset), 2208 2209 /* Version 3 fields */ 2210 .incompatible_features = cpu_to_be64(s->incompatible_features), 2211 .compatible_features = cpu_to_be64(s->compatible_features), 2212 .autoclear_features = cpu_to_be64(s->autoclear_features), 2213 .refcount_order = cpu_to_be32(s->refcount_order), 2214 .header_length = cpu_to_be32(header_length), 2215 }; 2216 2217 /* For older versions, write a shorter header */ 2218 switch (s->qcow_version) { 2219 case 2: 2220 ret = offsetof(QCowHeader, incompatible_features); 2221 break; 2222 case 3: 2223 ret = sizeof(*header); 2224 break; 2225 default: 2226 ret = -EINVAL; 2227 goto fail; 2228 } 2229 2230 buf += ret; 2231 buflen -= ret; 2232 memset(buf, 0, buflen); 2233 2234 /* Preserve any unknown field in the header */ 2235 if (s->unknown_header_fields_size) { 2236 if (buflen < s->unknown_header_fields_size) { 2237 ret = -ENOSPC; 2238 goto fail; 2239 } 2240 2241 memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size); 2242 buf += s->unknown_header_fields_size; 2243 buflen -= s->unknown_header_fields_size; 2244 } 2245 2246 /* Backing file format header extension */ 2247 if (s->image_backing_format) { 2248 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, 2249 s->image_backing_format, 2250 strlen(s->image_backing_format), 2251 buflen); 2252 if (ret < 0) { 2253 goto fail; 2254 } 2255 2256 buf += ret; 2257 buflen -= ret; 2258 } 2259 2260 /* Full disk encryption header pointer extension */ 2261 if (s->crypto_header.offset != 0) { 2262 cpu_to_be64s(&s->crypto_header.offset); 2263 cpu_to_be64s(&s->crypto_header.length); 2264 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER, 2265 &s->crypto_header, sizeof(s->crypto_header), 2266 buflen); 2267 be64_to_cpus(&s->crypto_header.offset); 2268 be64_to_cpus(&s->crypto_header.length); 2269 if (ret < 0) { 2270 goto fail; 2271 } 2272 buf += ret; 2273 buflen -= ret; 2274 } 2275 2276 /* Feature table */ 2277 if (s->qcow_version >= 3) { 2278 Qcow2Feature features[] = { 2279 { 2280 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, 2281 .bit = QCOW2_INCOMPAT_DIRTY_BITNR, 2282 .name = "dirty bit", 2283 }, 2284 { 
2285 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, 2286 .bit = QCOW2_INCOMPAT_CORRUPT_BITNR, 2287 .name = "corrupt bit", 2288 }, 2289 { 2290 .type = QCOW2_FEAT_TYPE_COMPATIBLE, 2291 .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, 2292 .name = "lazy refcounts", 2293 }, 2294 }; 2295 2296 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, 2297 features, sizeof(features), buflen); 2298 if (ret < 0) { 2299 goto fail; 2300 } 2301 buf += ret; 2302 buflen -= ret; 2303 } 2304 2305 /* Bitmap extension */ 2306 if (s->nb_bitmaps > 0) { 2307 Qcow2BitmapHeaderExt bitmaps_header = { 2308 .nb_bitmaps = cpu_to_be32(s->nb_bitmaps), 2309 .bitmap_directory_size = 2310 cpu_to_be64(s->bitmap_directory_size), 2311 .bitmap_directory_offset = 2312 cpu_to_be64(s->bitmap_directory_offset) 2313 }; 2314 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS, 2315 &bitmaps_header, sizeof(bitmaps_header), 2316 buflen); 2317 if (ret < 0) { 2318 goto fail; 2319 } 2320 buf += ret; 2321 buflen -= ret; 2322 } 2323 2324 /* Keep unknown header extensions */ 2325 QLIST_FOREACH(uext, &s->unknown_header_ext, next) { 2326 ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); 2327 if (ret < 0) { 2328 goto fail; 2329 } 2330 2331 buf += ret; 2332 buflen -= ret; 2333 } 2334 2335 /* End of header extensions */ 2336 ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); 2337 if (ret < 0) { 2338 goto fail; 2339 } 2340 2341 buf += ret; 2342 buflen -= ret; 2343 2344 /* Backing file name */ 2345 if (s->image_backing_file) { 2346 size_t backing_file_len = strlen(s->image_backing_file); 2347 2348 if (buflen < backing_file_len) { 2349 ret = -ENOSPC; 2350 goto fail; 2351 } 2352 2353 /* Using strncpy is ok here, since buf is not NUL-terminated. */ 2354 strncpy(buf, s->image_backing_file, buflen); 2355 2356 header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); 2357 header->backing_file_size = cpu_to_be32(backing_file_len); 2358 } 2359 2360 /* Write the new header */ 2361 ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); 2362 if (ret < 0) { 2363 goto fail; 2364 } 2365 2366 ret = 0; 2367 fail: 2368 qemu_vfree(header); 2369 return ret; 2370 } 2371 2372 static int qcow2_change_backing_file(BlockDriverState *bs, 2373 const char *backing_file, const char *backing_fmt) 2374 { 2375 BDRVQcow2State *s = bs->opaque; 2376 2377 if (backing_file && strlen(backing_file) > 1023) { 2378 return -EINVAL; 2379 } 2380 2381 pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); 2382 pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); 2383 2384 g_free(s->image_backing_file); 2385 g_free(s->image_backing_format); 2386 2387 s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL; 2388 s->image_backing_format = backing_fmt ? 
g_strdup(bs->backing_format) : NULL; 2389 2390 return qcow2_update_header(bs); 2391 } 2392 2393 static int qcow2_crypt_method_from_format(const char *encryptfmt) 2394 { 2395 if (g_str_equal(encryptfmt, "luks")) { 2396 return QCOW_CRYPT_LUKS; 2397 } else if (g_str_equal(encryptfmt, "aes")) { 2398 return QCOW_CRYPT_AES; 2399 } else { 2400 return -EINVAL; 2401 } 2402 } 2403 2404 static int qcow2_set_up_encryption(BlockDriverState *bs, const char *encryptfmt, 2405 QemuOpts *opts, Error **errp) 2406 { 2407 BDRVQcow2State *s = bs->opaque; 2408 QCryptoBlockCreateOptions *cryptoopts = NULL; 2409 QCryptoBlock *crypto = NULL; 2410 int ret = -EINVAL; 2411 QDict *options, *encryptopts; 2412 int fmt; 2413 2414 options = qemu_opts_to_qdict(opts, NULL); 2415 qdict_extract_subqdict(options, &encryptopts, "encrypt."); 2416 QDECREF(options); 2417 2418 fmt = qcow2_crypt_method_from_format(encryptfmt); 2419 2420 switch (fmt) { 2421 case QCOW_CRYPT_LUKS: 2422 cryptoopts = block_crypto_create_opts_init( 2423 Q_CRYPTO_BLOCK_FORMAT_LUKS, encryptopts, errp); 2424 break; 2425 case QCOW_CRYPT_AES: 2426 cryptoopts = block_crypto_create_opts_init( 2427 Q_CRYPTO_BLOCK_FORMAT_QCOW, encryptopts, errp); 2428 break; 2429 default: 2430 error_setg(errp, "Unknown encryption format '%s'", encryptfmt); 2431 break; 2432 } 2433 if (!cryptoopts) { 2434 ret = -EINVAL; 2435 goto out; 2436 } 2437 s->crypt_method_header = fmt; 2438 2439 crypto = qcrypto_block_create(cryptoopts, "encrypt.", 2440 qcow2_crypto_hdr_init_func, 2441 qcow2_crypto_hdr_write_func, 2442 bs, errp); 2443 if (!crypto) { 2444 ret = -EINVAL; 2445 goto out; 2446 } 2447 2448 ret = qcow2_update_header(bs); 2449 if (ret < 0) { 2450 error_setg_errno(errp, -ret, "Could not write encryption header"); 2451 goto out; 2452 } 2453 2454 out: 2455 QDECREF(encryptopts); 2456 qcrypto_block_free(crypto); 2457 qapi_free_QCryptoBlockCreateOptions(cryptoopts); 2458 return ret; 2459 } 2460 2461 2462 /** 2463 * Preallocates metadata structures for data clusters between @offset (in the 2464 * guest disk) and @new_length (which is thus generally the new guest disk 2465 * size). 2466 * 2467 * Returns: 0 on success, -errno on failure. 2468 */ 2469 static int preallocate(BlockDriverState *bs, 2470 uint64_t offset, uint64_t new_length) 2471 { 2472 BDRVQcow2State *s = bs->opaque; 2473 uint64_t bytes; 2474 uint64_t host_offset = 0; 2475 unsigned int cur_bytes; 2476 int ret; 2477 QCowL2Meta *meta; 2478 2479 if (qemu_in_coroutine()) { 2480 qemu_co_mutex_lock(&s->lock); 2481 } 2482 2483 assert(offset <= new_length); 2484 bytes = new_length - offset; 2485 2486 while (bytes) { 2487 cur_bytes = MIN(bytes, INT_MAX); 2488 ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, 2489 &host_offset, &meta); 2490 if (ret < 0) { 2491 goto done; 2492 } 2493 2494 while (meta) { 2495 QCowL2Meta *next = meta->next; 2496 2497 ret = qcow2_alloc_cluster_link_l2(bs, meta); 2498 if (ret < 0) { 2499 qcow2_free_any_clusters(bs, meta->alloc_offset, 2500 meta->nb_clusters, QCOW2_DISCARD_NEVER); 2501 goto done; 2502 } 2503 2504 /* There are no dependent requests, but we need to remove our 2505 * request from the list of in-flight requests */ 2506 QLIST_REMOVE(meta, next_in_flight); 2507 2508 g_free(meta); 2509 meta = next; 2510 } 2511 2512 /* TODO Preallocate data if requested */ 2513 2514 bytes -= cur_bytes; 2515 offset += cur_bytes; 2516 } 2517 2518 /* 2519 * It is expected that the image file is large enough to actually contain 2520 * all of the allocated clusters (otherwise we get failing reads after 2521 * EOF). 
Extend the image to the last allocated sector. 2522 */ 2523 if (host_offset != 0) { 2524 uint8_t data = 0; 2525 ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1, 2526 &data, 1); 2527 if (ret < 0) { 2528 goto done; 2529 } 2530 } 2531 2532 ret = 0; 2533 2534 done: 2535 if (qemu_in_coroutine()) { 2536 qemu_co_mutex_unlock(&s->lock); 2537 } 2538 return ret; 2539 } 2540 2541 /* qcow2_refcount_metadata_size: 2542 * @clusters: number of clusters to refcount (including data and L1/L2 tables) 2543 * @cluster_size: size of a cluster, in bytes 2544 * @refcount_order: refcount bits power-of-2 exponent 2545 * @generous_increase: allow for the refcount table to be 1.5x as large as it 2546 * needs to be 2547 * 2548 * Returns: Number of bytes required for refcount blocks and table metadata. 2549 */ 2550 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size, 2551 int refcount_order, bool generous_increase, 2552 uint64_t *refblock_count) 2553 { 2554 /* 2555 * Every host cluster is reference-counted, including metadata (even 2556 * refcount metadata is recursively included). 2557 * 2558 * An accurate formula for the size of refcount metadata size is difficult 2559 * to derive. An easier method of calculation is finding the fixed point 2560 * where no further refcount blocks or table clusters are required to 2561 * reference count every cluster. 2562 */ 2563 int64_t blocks_per_table_cluster = cluster_size / sizeof(uint64_t); 2564 int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order); 2565 int64_t table = 0; /* number of refcount table clusters */ 2566 int64_t blocks = 0; /* number of refcount block clusters */ 2567 int64_t last; 2568 int64_t n = 0; 2569 2570 do { 2571 last = n; 2572 blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block); 2573 table = DIV_ROUND_UP(blocks, blocks_per_table_cluster); 2574 n = clusters + blocks + table; 2575 2576 if (n == last && generous_increase) { 2577 clusters += DIV_ROUND_UP(table, 2); 2578 n = 0; /* force another loop */ 2579 generous_increase = false; 2580 } 2581 } while (n != last); 2582 2583 if (refblock_count) { 2584 *refblock_count = blocks; 2585 } 2586 2587 return (blocks + table) * cluster_size; 2588 } 2589 2590 /** 2591 * qcow2_calc_prealloc_size: 2592 * @total_size: virtual disk size in bytes 2593 * @cluster_size: cluster size in bytes 2594 * @refcount_order: refcount bits power-of-2 exponent 2595 * 2596 * Returns: Total number of bytes required for the fully allocated image 2597 * (including metadata). 
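 *
 * A rough worked example (illustrative numbers only): with the default
 * 64 KiB clusters and 16-bit refcounts, a 10 GiB image comes to about
 * 1.8 MiB of metadata on top of the 10 GiB of data clusters - one header
 * cluster, 20 clusters of L2 tables, one L1 cluster and 7 clusters of
 * refcount blocks and refcount table.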
2598 */ 2599 static int64_t qcow2_calc_prealloc_size(int64_t total_size, 2600 size_t cluster_size, 2601 int refcount_order) 2602 { 2603 int64_t meta_size = 0; 2604 uint64_t nl1e, nl2e; 2605 int64_t aligned_total_size = align_offset(total_size, cluster_size); 2606 2607 /* header: 1 cluster */ 2608 meta_size += cluster_size; 2609 2610 /* total size of L2 tables */ 2611 nl2e = aligned_total_size / cluster_size; 2612 nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t)); 2613 meta_size += nl2e * sizeof(uint64_t); 2614 2615 /* total size of L1 tables */ 2616 nl1e = nl2e * sizeof(uint64_t) / cluster_size; 2617 nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t)); 2618 meta_size += nl1e * sizeof(uint64_t); 2619 2620 /* total size of refcount table and blocks */ 2621 meta_size += qcow2_refcount_metadata_size( 2622 (meta_size + aligned_total_size) / cluster_size, 2623 cluster_size, refcount_order, false, NULL); 2624 2625 return meta_size + aligned_total_size; 2626 } 2627 2628 static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, Error **errp) 2629 { 2630 size_t cluster_size; 2631 int cluster_bits; 2632 2633 cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 2634 DEFAULT_CLUSTER_SIZE); 2635 cluster_bits = ctz32(cluster_size); 2636 if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || 2637 (1 << cluster_bits) != cluster_size) 2638 { 2639 error_setg(errp, "Cluster size must be a power of two between %d and " 2640 "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); 2641 return 0; 2642 } 2643 return cluster_size; 2644 } 2645 2646 static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp) 2647 { 2648 char *buf; 2649 int ret; 2650 2651 buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL); 2652 if (!buf) { 2653 ret = 3; /* default */ 2654 } else if (!strcmp(buf, "0.10")) { 2655 ret = 2; 2656 } else if (!strcmp(buf, "1.1")) { 2657 ret = 3; 2658 } else { 2659 error_setg(errp, "Invalid compatibility level: '%s'", buf); 2660 ret = -EINVAL; 2661 } 2662 g_free(buf); 2663 return ret; 2664 } 2665 2666 static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version, 2667 Error **errp) 2668 { 2669 uint64_t refcount_bits; 2670 2671 refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16); 2672 if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) { 2673 error_setg(errp, "Refcount width must be a power of two and may not " 2674 "exceed 64 bits"); 2675 return 0; 2676 } 2677 2678 if (version < 3 && refcount_bits != 16) { 2679 error_setg(errp, "Different refcount widths than 16 bits require " 2680 "compatibility level 1.1 or above (use compat=1.1 or " 2681 "greater)"); 2682 return 0; 2683 } 2684 2685 return refcount_bits; 2686 } 2687 2688 static int qcow2_create2(const char *filename, int64_t total_size, 2689 const char *backing_file, const char *backing_format, 2690 int flags, size_t cluster_size, PreallocMode prealloc, 2691 QemuOpts *opts, int version, int refcount_order, 2692 const char *encryptfmt, Error **errp) 2693 { 2694 QDict *options; 2695 2696 /* 2697 * Open the image file and write a minimal qcow2 header. 2698 * 2699 * We keep things simple and start with a zero-sized image. We also 2700 * do without refcount blocks or a L1 table for now. We'll fix the 2701 * inconsistency later. 
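     *
     * ("Fix it later" happens further down in this function: the file is
     * reopened as a qcow2 image and qcow2_alloc_clusters() is asked for
     * 3 * cluster_size bytes, which allocates - and thereby refcounts - the
     * clusters holding the header, the refcount table and the first
     * refcount block that are written out by hand below.)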
2702 * 2703 * We do need a refcount table because growing the refcount table means 2704 * allocating two new refcount blocks - the seconds of which would be at 2705 * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file 2706 * size for any qcow2 image. 2707 */ 2708 BlockBackend *blk; 2709 QCowHeader *header; 2710 uint64_t* refcount_table; 2711 Error *local_err = NULL; 2712 int ret; 2713 2714 if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { 2715 int64_t prealloc_size = 2716 qcow2_calc_prealloc_size(total_size, cluster_size, refcount_order); 2717 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, prealloc_size, &error_abort); 2718 qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_str(prealloc), 2719 &error_abort); 2720 } 2721 2722 ret = bdrv_create_file(filename, opts, &local_err); 2723 if (ret < 0) { 2724 error_propagate(errp, local_err); 2725 return ret; 2726 } 2727 2728 blk = blk_new_open(filename, NULL, NULL, 2729 BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, 2730 &local_err); 2731 if (blk == NULL) { 2732 error_propagate(errp, local_err); 2733 return -EIO; 2734 } 2735 2736 blk_set_allow_write_beyond_eof(blk, true); 2737 2738 /* Write the header */ 2739 QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); 2740 header = g_malloc0(cluster_size); 2741 *header = (QCowHeader) { 2742 .magic = cpu_to_be32(QCOW_MAGIC), 2743 .version = cpu_to_be32(version), 2744 .cluster_bits = cpu_to_be32(ctz32(cluster_size)), 2745 .size = cpu_to_be64(0), 2746 .l1_table_offset = cpu_to_be64(0), 2747 .l1_size = cpu_to_be32(0), 2748 .refcount_table_offset = cpu_to_be64(cluster_size), 2749 .refcount_table_clusters = cpu_to_be32(1), 2750 .refcount_order = cpu_to_be32(refcount_order), 2751 .header_length = cpu_to_be32(sizeof(*header)), 2752 }; 2753 2754 /* We'll update this to correct value later */ 2755 header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); 2756 2757 if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { 2758 header->compatible_features |= 2759 cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); 2760 } 2761 2762 ret = blk_pwrite(blk, 0, header, cluster_size, 0); 2763 g_free(header); 2764 if (ret < 0) { 2765 error_setg_errno(errp, -ret, "Could not write qcow2 header"); 2766 goto out; 2767 } 2768 2769 /* Write a refcount table with one refcount block */ 2770 refcount_table = g_malloc0(2 * cluster_size); 2771 refcount_table[0] = cpu_to_be64(2 * cluster_size); 2772 ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0); 2773 g_free(refcount_table); 2774 2775 if (ret < 0) { 2776 error_setg_errno(errp, -ret, "Could not write refcount table"); 2777 goto out; 2778 } 2779 2780 blk_unref(blk); 2781 blk = NULL; 2782 2783 /* 2784 * And now open the image and make it consistent first (i.e. 
increase the 2785 * refcount of the cluster that is occupied by the header and the refcount 2786 * table) 2787 */ 2788 options = qdict_new(); 2789 qdict_put_str(options, "driver", "qcow2"); 2790 blk = blk_new_open(filename, NULL, options, 2791 BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH, 2792 &local_err); 2793 if (blk == NULL) { 2794 error_propagate(errp, local_err); 2795 ret = -EIO; 2796 goto out; 2797 } 2798 2799 ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size); 2800 if (ret < 0) { 2801 error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " 2802 "header and refcount table"); 2803 goto out; 2804 2805 } else if (ret != 0) { 2806 error_report("Huh, first cluster in empty image is already in use?"); 2807 abort(); 2808 } 2809 2810 /* Create a full header (including things like feature table) */ 2811 ret = qcow2_update_header(blk_bs(blk)); 2812 if (ret < 0) { 2813 error_setg_errno(errp, -ret, "Could not update qcow2 header"); 2814 goto out; 2815 } 2816 2817 /* Okay, now that we have a valid image, let's give it the right size */ 2818 ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp); 2819 if (ret < 0) { 2820 error_prepend(errp, "Could not resize image: "); 2821 goto out; 2822 } 2823 2824 /* Want a backing file? There you go.*/ 2825 if (backing_file) { 2826 ret = bdrv_change_backing_file(blk_bs(blk), backing_file, backing_format); 2827 if (ret < 0) { 2828 error_setg_errno(errp, -ret, "Could not assign backing file '%s' " 2829 "with format '%s'", backing_file, backing_format); 2830 goto out; 2831 } 2832 } 2833 2834 /* Want encryption? There you go. */ 2835 if (encryptfmt) { 2836 ret = qcow2_set_up_encryption(blk_bs(blk), encryptfmt, opts, errp); 2837 if (ret < 0) { 2838 goto out; 2839 } 2840 } 2841 2842 /* And if we're supposed to preallocate metadata, do that now */ 2843 if (prealloc != PREALLOC_MODE_OFF) { 2844 ret = preallocate(blk_bs(blk), 0, total_size); 2845 if (ret < 0) { 2846 error_setg_errno(errp, -ret, "Could not preallocate metadata"); 2847 goto out; 2848 } 2849 } 2850 2851 blk_unref(blk); 2852 blk = NULL; 2853 2854 /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning. 2855 * Using BDRV_O_NO_IO, since encryption is now setup we don't want to 2856 * have to setup decryption context. We're not doing any I/O on the top 2857 * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does 2858 * not have effect. 
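     * (The reopen further up deliberately passed BDRV_O_NO_FLUSH, so none of
     * the metadata fix-ups made through that handle forced a flush of their
     * own; the image is flushed once, here, instead.)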
2859 */ 2860 options = qdict_new(); 2861 qdict_put_str(options, "driver", "qcow2"); 2862 blk = blk_new_open(filename, NULL, options, 2863 BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO, 2864 &local_err); 2865 if (blk == NULL) { 2866 error_propagate(errp, local_err); 2867 ret = -EIO; 2868 goto out; 2869 } 2870 2871 ret = 0; 2872 out: 2873 if (blk) { 2874 blk_unref(blk); 2875 } 2876 return ret; 2877 } 2878 2879 static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) 2880 { 2881 char *backing_file = NULL; 2882 char *backing_fmt = NULL; 2883 char *buf = NULL; 2884 uint64_t size = 0; 2885 int flags = 0; 2886 size_t cluster_size = DEFAULT_CLUSTER_SIZE; 2887 PreallocMode prealloc; 2888 int version; 2889 uint64_t refcount_bits; 2890 int refcount_order; 2891 char *encryptfmt = NULL; 2892 Error *local_err = NULL; 2893 int ret; 2894 2895 /* Read out options */ 2896 size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 2897 BDRV_SECTOR_SIZE); 2898 backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); 2899 backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); 2900 encryptfmt = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT); 2901 if (encryptfmt) { 2902 if (qemu_opt_get(opts, BLOCK_OPT_ENCRYPT)) { 2903 error_setg(errp, "Options " BLOCK_OPT_ENCRYPT " and " 2904 BLOCK_OPT_ENCRYPT_FORMAT " are mutually exclusive"); 2905 ret = -EINVAL; 2906 goto finish; 2907 } 2908 } else if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { 2909 encryptfmt = g_strdup("aes"); 2910 } 2911 cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err); 2912 if (local_err) { 2913 error_propagate(errp, local_err); 2914 ret = -EINVAL; 2915 goto finish; 2916 } 2917 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); 2918 prealloc = qapi_enum_parse(&PreallocMode_lookup, buf, 2919 PREALLOC_MODE_OFF, &local_err); 2920 if (local_err) { 2921 error_propagate(errp, local_err); 2922 ret = -EINVAL; 2923 goto finish; 2924 } 2925 2926 version = qcow2_opt_get_version_del(opts, &local_err); 2927 if (local_err) { 2928 error_propagate(errp, local_err); 2929 ret = -EINVAL; 2930 goto finish; 2931 } 2932 2933 if (qemu_opt_get_bool_del(opts, BLOCK_OPT_LAZY_REFCOUNTS, false)) { 2934 flags |= BLOCK_FLAG_LAZY_REFCOUNTS; 2935 } 2936 2937 if (backing_file && prealloc != PREALLOC_MODE_OFF) { 2938 error_setg(errp, "Backing file and preallocation cannot be used at " 2939 "the same time"); 2940 ret = -EINVAL; 2941 goto finish; 2942 } 2943 2944 if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { 2945 error_setg(errp, "Lazy refcounts only supported with compatibility " 2946 "level 1.1 and above (use compat=1.1 or greater)"); 2947 ret = -EINVAL; 2948 goto finish; 2949 } 2950 2951 refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err); 2952 if (local_err) { 2953 error_propagate(errp, local_err); 2954 ret = -EINVAL; 2955 goto finish; 2956 } 2957 2958 refcount_order = ctz32(refcount_bits); 2959 2960 ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, 2961 cluster_size, prealloc, opts, version, refcount_order, 2962 encryptfmt, &local_err); 2963 error_propagate(errp, local_err); 2964 2965 finish: 2966 g_free(backing_file); 2967 g_free(backing_fmt); 2968 g_free(encryptfmt); 2969 g_free(buf); 2970 return ret; 2971 } 2972 2973 2974 static bool is_zero_sectors(BlockDriverState *bs, int64_t start, 2975 uint32_t count) 2976 { 2977 int nr; 2978 BlockDriverState *file; 2979 int64_t res; 2980 2981 if (start + count > bs->total_sectors) { 2982 count = bs->total_sectors - start; 
2983 } 2984 2985 if (!count) { 2986 return true; 2987 } 2988 res = bdrv_get_block_status_above(bs, NULL, start, count, 2989 &nr, &file); 2990 return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count; 2991 } 2992 2993 static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, 2994 int64_t offset, int bytes, BdrvRequestFlags flags) 2995 { 2996 int ret; 2997 BDRVQcow2State *s = bs->opaque; 2998 2999 uint32_t head = offset % s->cluster_size; 3000 uint32_t tail = (offset + bytes) % s->cluster_size; 3001 3002 trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes); 3003 if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) { 3004 tail = 0; 3005 } 3006 3007 if (head || tail) { 3008 int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS; 3009 uint64_t off; 3010 unsigned int nr; 3011 3012 assert(head + bytes <= s->cluster_size); 3013 3014 /* check whether remainder of cluster already reads as zero */ 3015 if (!(is_zero_sectors(bs, cl_start, 3016 DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) && 3017 is_zero_sectors(bs, (offset + bytes) >> BDRV_SECTOR_BITS, 3018 DIV_ROUND_UP(-tail & (s->cluster_size - 1), 3019 BDRV_SECTOR_SIZE)))) { 3020 return -ENOTSUP; 3021 } 3022 3023 qemu_co_mutex_lock(&s->lock); 3024 /* We can have new write after previous check */ 3025 offset = cl_start << BDRV_SECTOR_BITS; 3026 bytes = s->cluster_size; 3027 nr = s->cluster_size; 3028 ret = qcow2_get_cluster_offset(bs, offset, &nr, &off); 3029 if (ret != QCOW2_CLUSTER_UNALLOCATED && 3030 ret != QCOW2_CLUSTER_ZERO_PLAIN && 3031 ret != QCOW2_CLUSTER_ZERO_ALLOC) { 3032 qemu_co_mutex_unlock(&s->lock); 3033 return -ENOTSUP; 3034 } 3035 } else { 3036 qemu_co_mutex_lock(&s->lock); 3037 } 3038 3039 trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes); 3040 3041 /* Whatever is left can use real zero clusters */ 3042 ret = qcow2_cluster_zeroize(bs, offset, bytes, flags); 3043 qemu_co_mutex_unlock(&s->lock); 3044 3045 return ret; 3046 } 3047 3048 static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, 3049 int64_t offset, int bytes) 3050 { 3051 int ret; 3052 BDRVQcow2State *s = bs->opaque; 3053 3054 if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) { 3055 assert(bytes < s->cluster_size); 3056 /* Ignore partial clusters, except for the special case of the 3057 * complete partial cluster at the end of an unaligned file */ 3058 if (!QEMU_IS_ALIGNED(offset, s->cluster_size) || 3059 offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) { 3060 return -ENOTSUP; 3061 } 3062 } 3063 3064 qemu_co_mutex_lock(&s->lock); 3065 ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST, 3066 false); 3067 qemu_co_mutex_unlock(&s->lock); 3068 return ret; 3069 } 3070 3071 static int qcow2_truncate(BlockDriverState *bs, int64_t offset, 3072 PreallocMode prealloc, Error **errp) 3073 { 3074 BDRVQcow2State *s = bs->opaque; 3075 uint64_t old_length; 3076 int64_t new_l1_size; 3077 int ret; 3078 3079 if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA && 3080 prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL) 3081 { 3082 error_setg(errp, "Unsupported preallocation mode '%s'", 3083 PreallocMode_str(prealloc)); 3084 return -ENOTSUP; 3085 } 3086 3087 if (offset & 511) { 3088 error_setg(errp, "The new size must be a multiple of 512"); 3089 return -EINVAL; 3090 } 3091 3092 /* cannot proceed if image has snapshots */ 3093 if (s->nb_snapshots) { 3094 error_setg(errp, "Can't resize an image which has snapshots"); 3095 return -ENOTSUP; 3096 } 3097 3098 /* cannot proceed if 
image has bitmaps */
    if (s->nb_bitmaps) {
        /* TODO: resize bitmaps in the image */
        error_setg(errp, "Can't resize an image which has bitmaps");
        return -ENOTSUP;
    }

    old_length = bs->total_sectors * 512;

    /* shrinking is currently not supported */
    if (offset < old_length) {
        error_setg(errp, "qcow2 doesn't support shrinking images yet");
        return -ENOTSUP;
    }

    new_l1_size = size_to_l1(s, offset);
    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
        return ret;
    }

    switch (prealloc) {
    case PREALLOC_MODE_OFF:
        break;

    case PREALLOC_MODE_METADATA:
        ret = preallocate(bs, old_length, offset);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Preallocation failed");
            return ret;
        }
        break;

    case PREALLOC_MODE_FALLOC:
    case PREALLOC_MODE_FULL:
    {
        int64_t allocation_start, host_offset, guest_offset;
        int64_t clusters_allocated;
        int64_t old_file_size, new_file_size;
        uint64_t nb_new_data_clusters, nb_new_l2_tables;

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            return old_file_size;
        }

        nb_new_data_clusters = DIV_ROUND_UP(offset - old_length,
                                            s->cluster_size);

        /* This is an overestimation; we will not actually allocate space for
         * these in the file but just make sure the new refcount structures are
         * able to cover them so we will not have to allocate new refblocks
         * while entering the data blocks in the potentially new L2 tables.
         * (We do not actually care where the L2 tables are placed. Maybe they
         * are already allocated or they can be placed somewhere before
         * @old_file_size. It does not matter because they will be fully
         * allocated automatically, so they do not need to be covered by the
         * preallocation. All that matters is that we will not have to allocate
         * new refcount structures for them.)
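         *
         * For instance (illustrative numbers only): growing an image that
         * uses the default 64 KiB clusters by 1 GiB adds 16384 data
         * clusters; at 8192 L2 entries per table that is two new L2 tables,
         * plus the one extra table added below in case the range is not
         * aligned to an L2 boundary.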
         */
        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
                                        s->cluster_size / sizeof(uint64_t));
        /* The cluster range may not be aligned to L2 boundaries, so add one L2
         * table for a potential head/tail */
        nb_new_l2_tables++;

        allocation_start = qcow2_refcount_area(bs, old_file_size,
                                               nb_new_data_clusters +
                                               nb_new_l2_tables,
                                               true, 0, 0);
        if (allocation_start < 0) {
            error_setg_errno(errp, -allocation_start,
                             "Failed to resize refcount structures");
            return allocation_start;
        }

        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
                                                     nb_new_data_clusters);
        if (clusters_allocated < 0) {
            error_setg_errno(errp, -clusters_allocated,
                             "Failed to allocate data clusters");
            return clusters_allocated;
        }

        assert(clusters_allocated == nb_new_data_clusters);

        /* Allocate the data area */
        new_file_size = allocation_start +
                        nb_new_data_clusters * s->cluster_size;
        ret = bdrv_truncate(bs->file, new_file_size, prealloc, errp);
        if (ret < 0) {
            error_prepend(errp, "Failed to resize underlying file: ");
            qcow2_free_clusters(bs, allocation_start,
                                nb_new_data_clusters * s->cluster_size,
                                QCOW2_DISCARD_OTHER);
            return ret;
        }

        /* Create the necessary L2 entries */
        host_offset = allocation_start;
        guest_offset = old_length;
        while (nb_new_data_clusters) {
            int64_t guest_cluster = guest_offset >> s->cluster_bits;
            int64_t nb_clusters = MIN(nb_new_data_clusters,
                                      s->l2_size - guest_cluster % s->l2_size);
            QCowL2Meta allocation = {
                .offset = guest_offset,
                .alloc_offset = host_offset,
                .nb_clusters = nb_clusters,
            };
            qemu_co_queue_init(&allocation.dependent_requests);

            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to update L2 tables");
                qcow2_free_clusters(bs, host_offset,
                                    nb_new_data_clusters * s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
                return ret;
            }

            guest_offset += nb_clusters * s->cluster_size;
            host_offset += nb_clusters * s->cluster_size;
            nb_new_data_clusters -= nb_clusters;
        }
        break;
    }

    default:
        g_assert_not_reached();
    }

    if (prealloc != PREALLOC_MODE_OFF) {
        /* Flush metadata before actually changing the image size */
        ret = bdrv_flush(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the preallocated area to disk");
            return ret;
        }
    }

    /* write updated header.size */
    offset = cpu_to_be64(offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
                           &offset, sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to update the image size");
        return ret;
    }

    s->l1_vm_state_index = new_l1_size;
    return 0;
}

/* XXX: put compressed sectors first, then all the cluster aligned
   tables to avoid losing bytes in alignment */
static coroutine_fn int
qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                            uint64_t bytes, QEMUIOVector *qiov)
{
    BDRVQcow2State *s = bs->opaque;
    QEMUIOVector hd_qiov;
    struct iovec iov;
    z_stream strm;
    int ret, out_len;
    uint8_t *buf, *out_buf;
    int64_t cluster_offset;

    if (bytes == 0) {
        /* align end of file to a sector boundary to ease reading with
           sector based I/Os */
cluster_offset = bdrv_getlength(bs->file->bs); 3273 if (cluster_offset < 0) { 3274 return cluster_offset; 3275 } 3276 return bdrv_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF, NULL); 3277 } 3278 3279 buf = qemu_blockalign(bs, s->cluster_size); 3280 if (bytes != s->cluster_size) { 3281 if (bytes > s->cluster_size || 3282 offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS) 3283 { 3284 qemu_vfree(buf); 3285 return -EINVAL; 3286 } 3287 /* Zero-pad last write if image size is not cluster aligned */ 3288 memset(buf + bytes, 0, s->cluster_size - bytes); 3289 } 3290 qemu_iovec_to_buf(qiov, 0, buf, bytes); 3291 3292 out_buf = g_malloc(s->cluster_size); 3293 3294 /* best compression, small window, no zlib header */ 3295 memset(&strm, 0, sizeof(strm)); 3296 ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, 3297 Z_DEFLATED, -12, 3298 9, Z_DEFAULT_STRATEGY); 3299 if (ret != 0) { 3300 ret = -EINVAL; 3301 goto fail; 3302 } 3303 3304 strm.avail_in = s->cluster_size; 3305 strm.next_in = (uint8_t *)buf; 3306 strm.avail_out = s->cluster_size; 3307 strm.next_out = out_buf; 3308 3309 ret = deflate(&strm, Z_FINISH); 3310 if (ret != Z_STREAM_END && ret != Z_OK) { 3311 deflateEnd(&strm); 3312 ret = -EINVAL; 3313 goto fail; 3314 } 3315 out_len = strm.next_out - out_buf; 3316 3317 deflateEnd(&strm); 3318 3319 if (ret != Z_STREAM_END || out_len >= s->cluster_size) { 3320 /* could not compress: write normal cluster */ 3321 ret = qcow2_co_pwritev(bs, offset, bytes, qiov, 0); 3322 if (ret < 0) { 3323 goto fail; 3324 } 3325 goto success; 3326 } 3327 3328 qemu_co_mutex_lock(&s->lock); 3329 cluster_offset = 3330 qcow2_alloc_compressed_cluster_offset(bs, offset, out_len); 3331 if (!cluster_offset) { 3332 qemu_co_mutex_unlock(&s->lock); 3333 ret = -EIO; 3334 goto fail; 3335 } 3336 cluster_offset &= s->cluster_offset_mask; 3337 3338 ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len); 3339 qemu_co_mutex_unlock(&s->lock); 3340 if (ret < 0) { 3341 goto fail; 3342 } 3343 3344 iov = (struct iovec) { 3345 .iov_base = out_buf, 3346 .iov_len = out_len, 3347 }; 3348 qemu_iovec_init_external(&hd_qiov, &iov, 1); 3349 3350 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); 3351 ret = bdrv_co_pwritev(bs->file, cluster_offset, out_len, &hd_qiov, 0); 3352 if (ret < 0) { 3353 goto fail; 3354 } 3355 success: 3356 ret = 0; 3357 fail: 3358 qemu_vfree(buf); 3359 g_free(out_buf); 3360 return ret; 3361 } 3362 3363 static int make_completely_empty(BlockDriverState *bs) 3364 { 3365 BDRVQcow2State *s = bs->opaque; 3366 Error *local_err = NULL; 3367 int ret, l1_clusters; 3368 int64_t offset; 3369 uint64_t *new_reftable = NULL; 3370 uint64_t rt_entry, l1_size2; 3371 struct { 3372 uint64_t l1_offset; 3373 uint64_t reftable_offset; 3374 uint32_t reftable_clusters; 3375 } QEMU_PACKED l1_ofs_rt_ofs_cls; 3376 3377 ret = qcow2_cache_empty(bs, s->l2_table_cache); 3378 if (ret < 0) { 3379 goto fail; 3380 } 3381 3382 ret = qcow2_cache_empty(bs, s->refcount_block_cache); 3383 if (ret < 0) { 3384 goto fail; 3385 } 3386 3387 /* Refcounts will be broken utterly */ 3388 ret = qcow2_mark_dirty(bs); 3389 if (ret < 0) { 3390 goto fail; 3391 } 3392 3393 BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); 3394 3395 l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); 3396 l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t); 3397 3398 /* After this call, neither the in-memory nor the on-disk refcount 3399 * information accurately describe the actual references */ 3400 3401 ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset, 3402 
l1_clusters * s->cluster_size, 0); 3403 if (ret < 0) { 3404 goto fail_broken_refcounts; 3405 } 3406 memset(s->l1_table, 0, l1_size2); 3407 3408 BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE); 3409 3410 /* Overwrite enough clusters at the beginning of the sectors to place 3411 * the refcount table, a refcount block and the L1 table in; this may 3412 * overwrite parts of the existing refcount and L1 table, which is not 3413 * an issue because the dirty flag is set, complete data loss is in fact 3414 * desired and partial data loss is consequently fine as well */ 3415 ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size, 3416 (2 + l1_clusters) * s->cluster_size, 0); 3417 /* This call (even if it failed overall) may have overwritten on-disk 3418 * refcount structures; in that case, the in-memory refcount information 3419 * will probably differ from the on-disk information which makes the BDS 3420 * unusable */ 3421 if (ret < 0) { 3422 goto fail_broken_refcounts; 3423 } 3424 3425 BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); 3426 BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); 3427 3428 /* "Create" an empty reftable (one cluster) directly after the image 3429 * header and an empty L1 table three clusters after the image header; 3430 * the cluster between those two will be used as the first refblock */ 3431 l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size); 3432 l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size); 3433 l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1); 3434 ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset), 3435 &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); 3436 if (ret < 0) { 3437 goto fail_broken_refcounts; 3438 } 3439 3440 s->l1_table_offset = 3 * s->cluster_size; 3441 3442 new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t)); 3443 if (!new_reftable) { 3444 ret = -ENOMEM; 3445 goto fail_broken_refcounts; 3446 } 3447 3448 s->refcount_table_offset = s->cluster_size; 3449 s->refcount_table_size = s->cluster_size / sizeof(uint64_t); 3450 s->max_refcount_table_index = 0; 3451 3452 g_free(s->refcount_table); 3453 s->refcount_table = new_reftable; 3454 new_reftable = NULL; 3455 3456 /* Now the in-memory refcount information again corresponds to the on-disk 3457 * information (reftable is empty and no refblocks (the refblock cache is 3458 * empty)); however, this means some clusters (e.g. 
the image header) are 3459 * referenced, but not refcounted, but the normal qcow2 code assumes that 3460 * the in-memory information is always correct */ 3461 3462 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); 3463 3464 /* Enter the first refblock into the reftable */ 3465 rt_entry = cpu_to_be64(2 * s->cluster_size); 3466 ret = bdrv_pwrite_sync(bs->file, s->cluster_size, 3467 &rt_entry, sizeof(rt_entry)); 3468 if (ret < 0) { 3469 goto fail_broken_refcounts; 3470 } 3471 s->refcount_table[0] = 2 * s->cluster_size; 3472 3473 s->free_cluster_index = 0; 3474 assert(3 + l1_clusters <= s->refcount_block_size); 3475 offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2); 3476 if (offset < 0) { 3477 ret = offset; 3478 goto fail_broken_refcounts; 3479 } else if (offset > 0) { 3480 error_report("First cluster in emptied image is in use"); 3481 abort(); 3482 } 3483 3484 /* Now finally the in-memory information corresponds to the on-disk 3485 * structures and is correct */ 3486 ret = qcow2_mark_clean(bs); 3487 if (ret < 0) { 3488 goto fail; 3489 } 3490 3491 ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, 3492 PREALLOC_MODE_OFF, &local_err); 3493 if (ret < 0) { 3494 error_report_err(local_err); 3495 goto fail; 3496 } 3497 3498 return 0; 3499 3500 fail_broken_refcounts: 3501 /* The BDS is unusable at this point. If we wanted to make it usable, we 3502 * would have to call qcow2_refcount_close(), qcow2_refcount_init(), 3503 * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init() 3504 * again. However, because the functions which could have caused this error 3505 * path to be taken are used by those functions as well, it's very likely 3506 * that that sequence will fail as well. Therefore, just eject the BDS. */ 3507 bs->drv = NULL; 3508 3509 fail: 3510 g_free(new_reftable); 3511 return ret; 3512 } 3513 3514 static int qcow2_make_empty(BlockDriverState *bs) 3515 { 3516 BDRVQcow2State *s = bs->opaque; 3517 uint64_t offset, end_offset; 3518 int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size); 3519 int l1_clusters, ret = 0; 3520 3521 l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); 3522 3523 if (s->qcow_version >= 3 && !s->snapshots && 3524 3 + l1_clusters <= s->refcount_block_size) { 3525 /* The following function only works for qcow2 v3 images (it requires 3526 * the dirty flag) and only as long as there are no snapshots (because 3527 * it completely empties the image). Furthermore, the L1 table and three 3528 * additional clusters (image header, refcount table, one refcount 3529 * block) have to fit inside one refcount block. */ 3530 return make_completely_empty(bs); 3531 } 3532 3533 /* This fallback code simply discards every active cluster; this is slow, 3534 * but works in all cases */ 3535 end_offset = bs->total_sectors * BDRV_SECTOR_SIZE; 3536 for (offset = 0; offset < end_offset; offset += step) { 3537 /* As this function is generally used after committing an external 3538 * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the 3539 * default action for this kind of discard is to pass the discard, 3540 * which will ideally result in an actually smaller image file, as 3541 * is probably desired. 
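         * (That default corresponds to qcow2's pass-discard-snapshot
         * run-time option, which is enabled unless the user turned it off.)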
*/ 3542 ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset), 3543 QCOW2_DISCARD_SNAPSHOT, true); 3544 if (ret < 0) { 3545 break; 3546 } 3547 } 3548 3549 return ret; 3550 } 3551 3552 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) 3553 { 3554 BDRVQcow2State *s = bs->opaque; 3555 int ret; 3556 3557 qemu_co_mutex_lock(&s->lock); 3558 ret = qcow2_cache_write(bs, s->l2_table_cache); 3559 if (ret < 0) { 3560 qemu_co_mutex_unlock(&s->lock); 3561 return ret; 3562 } 3563 3564 if (qcow2_need_accurate_refcounts(s)) { 3565 ret = qcow2_cache_write(bs, s->refcount_block_cache); 3566 if (ret < 0) { 3567 qemu_co_mutex_unlock(&s->lock); 3568 return ret; 3569 } 3570 } 3571 qemu_co_mutex_unlock(&s->lock); 3572 3573 return 0; 3574 } 3575 3576 static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs, 3577 Error **errp) 3578 { 3579 Error *local_err = NULL; 3580 BlockMeasureInfo *info; 3581 uint64_t required = 0; /* bytes that contribute to required size */ 3582 uint64_t virtual_size; /* disk size as seen by guest */ 3583 uint64_t refcount_bits; 3584 uint64_t l2_tables; 3585 size_t cluster_size; 3586 int version; 3587 char *optstr; 3588 PreallocMode prealloc; 3589 bool has_backing_file; 3590 3591 /* Parse image creation options */ 3592 cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err); 3593 if (local_err) { 3594 goto err; 3595 } 3596 3597 version = qcow2_opt_get_version_del(opts, &local_err); 3598 if (local_err) { 3599 goto err; 3600 } 3601 3602 refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err); 3603 if (local_err) { 3604 goto err; 3605 } 3606 3607 optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); 3608 prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr, 3609 PREALLOC_MODE_OFF, &local_err); 3610 g_free(optstr); 3611 if (local_err) { 3612 goto err; 3613 } 3614 3615 optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); 3616 has_backing_file = !!optstr; 3617 g_free(optstr); 3618 3619 virtual_size = align_offset(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 3620 cluster_size); 3621 3622 /* Check that virtual disk size is valid */ 3623 l2_tables = DIV_ROUND_UP(virtual_size / cluster_size, 3624 cluster_size / sizeof(uint64_t)); 3625 if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) { 3626 error_setg(&local_err, "The image size is too large " 3627 "(try using a larger cluster size)"); 3628 goto err; 3629 } 3630 3631 /* Account for input image */ 3632 if (in_bs) { 3633 int64_t ssize = bdrv_getlength(in_bs); 3634 if (ssize < 0) { 3635 error_setg_errno(&local_err, -ssize, 3636 "Unable to get image virtual_size"); 3637 goto err; 3638 } 3639 3640 virtual_size = align_offset(ssize, cluster_size); 3641 3642 if (has_backing_file) { 3643 /* We don't how much of the backing chain is shared by the input 3644 * image and the new image file. In the worst case the new image's 3645 * backing file has nothing in common with the input image. Be 3646 * conservative and assume all clusters need to be written. 
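             * (With required == virtual_size, the final calculation at the
             * bottom of this function simply reduces to
             * info->fully_allocated.)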
3647 */ 3648 required = virtual_size; 3649 } else { 3650 int cluster_sectors = cluster_size / BDRV_SECTOR_SIZE; 3651 int64_t sector_num; 3652 int pnum = 0; 3653 3654 for (sector_num = 0; 3655 sector_num < ssize / BDRV_SECTOR_SIZE; 3656 sector_num += pnum) { 3657 int nb_sectors = MIN(ssize / BDRV_SECTOR_SIZE - sector_num, 3658 BDRV_REQUEST_MAX_SECTORS); 3659 BlockDriverState *file; 3660 int64_t ret; 3661 3662 ret = bdrv_get_block_status_above(in_bs, NULL, 3663 sector_num, nb_sectors, 3664 &pnum, &file); 3665 if (ret < 0) { 3666 error_setg_errno(&local_err, -ret, 3667 "Unable to get block status"); 3668 goto err; 3669 } 3670 3671 if (ret & BDRV_BLOCK_ZERO) { 3672 /* Skip zero regions (safe with no backing file) */ 3673 } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) == 3674 (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) { 3675 /* Extend pnum to end of cluster for next iteration */ 3676 pnum = ROUND_UP(sector_num + pnum, cluster_sectors) - 3677 sector_num; 3678 3679 /* Count clusters we've seen */ 3680 required += (sector_num % cluster_sectors + pnum) * 3681 BDRV_SECTOR_SIZE; 3682 } 3683 } 3684 } 3685 } 3686 3687 /* Take into account preallocation. Nothing special is needed for 3688 * PREALLOC_MODE_METADATA since metadata is always counted. 3689 */ 3690 if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { 3691 required = virtual_size; 3692 } 3693 3694 info = g_new(BlockMeasureInfo, 1); 3695 info->fully_allocated = 3696 qcow2_calc_prealloc_size(virtual_size, cluster_size, 3697 ctz32(refcount_bits)); 3698 3699 /* Remove data clusters that are not required. This overestimates the 3700 * required size because metadata needed for the fully allocated file is 3701 * still counted. 3702 */ 3703 info->required = info->fully_allocated - virtual_size + required; 3704 return info; 3705 3706 err: 3707 error_propagate(errp, local_err); 3708 return NULL; 3709 } 3710 3711 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 3712 { 3713 BDRVQcow2State *s = bs->opaque; 3714 bdi->unallocated_blocks_are_zero = true; 3715 bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3); 3716 bdi->cluster_size = s->cluster_size; 3717 bdi->vm_state_offset = qcow2_vm_state_offset(s); 3718 return 0; 3719 } 3720 3721 static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) 3722 { 3723 BDRVQcow2State *s = bs->opaque; 3724 ImageInfoSpecific *spec_info; 3725 QCryptoBlockInfo *encrypt_info = NULL; 3726 3727 if (s->crypto != NULL) { 3728 encrypt_info = qcrypto_block_get_info(s->crypto, &error_abort); 3729 } 3730 3731 spec_info = g_new(ImageInfoSpecific, 1); 3732 *spec_info = (ImageInfoSpecific){ 3733 .type = IMAGE_INFO_SPECIFIC_KIND_QCOW2, 3734 .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1), 3735 }; 3736 if (s->qcow_version == 2) { 3737 *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){ 3738 .compat = g_strdup("0.10"), 3739 .refcount_bits = s->refcount_bits, 3740 }; 3741 } else if (s->qcow_version == 3) { 3742 *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){ 3743 .compat = g_strdup("1.1"), 3744 .lazy_refcounts = s->compatible_features & 3745 QCOW2_COMPAT_LAZY_REFCOUNTS, 3746 .has_lazy_refcounts = true, 3747 .corrupt = s->incompatible_features & 3748 QCOW2_INCOMPAT_CORRUPT, 3749 .has_corrupt = true, 3750 .refcount_bits = s->refcount_bits, 3751 }; 3752 } else { 3753 /* if this assertion fails, this probably means a new version was 3754 * added without having it covered here */ 3755 assert(false); 3756 } 3757 3758 if (encrypt_info) { 3759 
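        /* Repackage the generic QCryptoBlockInfo as the qcow2-specific QAPI
         * type.  The union members are shallow-copied into qencrypt, so the
         * corresponding pointers in encrypt_info are zeroed further down
         * before it is freed, to avoid freeing data that qencrypt now owns. */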
ImageInfoSpecificQCow2Encryption *qencrypt = 3760 g_new(ImageInfoSpecificQCow2Encryption, 1); 3761 switch (encrypt_info->format) { 3762 case Q_CRYPTO_BLOCK_FORMAT_QCOW: 3763 qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES; 3764 qencrypt->u.aes = encrypt_info->u.qcow; 3765 break; 3766 case Q_CRYPTO_BLOCK_FORMAT_LUKS: 3767 qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS; 3768 qencrypt->u.luks = encrypt_info->u.luks; 3769 break; 3770 default: 3771 abort(); 3772 } 3773 /* Since we did shallow copy above, erase any pointers 3774 * in the original info */ 3775 memset(&encrypt_info->u, 0, sizeof(encrypt_info->u)); 3776 qapi_free_QCryptoBlockInfo(encrypt_info); 3777 3778 spec_info->u.qcow2.data->has_encrypt = true; 3779 spec_info->u.qcow2.data->encrypt = qencrypt; 3780 } 3781 3782 return spec_info; 3783 } 3784 3785 static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, 3786 int64_t pos) 3787 { 3788 BDRVQcow2State *s = bs->opaque; 3789 3790 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); 3791 return bs->drv->bdrv_co_pwritev(bs, qcow2_vm_state_offset(s) + pos, 3792 qiov->size, qiov, 0); 3793 } 3794 3795 static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, 3796 int64_t pos) 3797 { 3798 BDRVQcow2State *s = bs->opaque; 3799 3800 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); 3801 return bs->drv->bdrv_co_preadv(bs, qcow2_vm_state_offset(s) + pos, 3802 qiov->size, qiov, 0); 3803 } 3804 3805 /* 3806 * Downgrades an image's version. To achieve this, any incompatible features 3807 * have to be removed. 3808 */ 3809 static int qcow2_downgrade(BlockDriverState *bs, int target_version, 3810 BlockDriverAmendStatusCB *status_cb, void *cb_opaque) 3811 { 3812 BDRVQcow2State *s = bs->opaque; 3813 int current_version = s->qcow_version; 3814 int ret; 3815 3816 if (target_version == current_version) { 3817 return 0; 3818 } else if (target_version > current_version) { 3819 return -EINVAL; 3820 } else if (target_version != 2) { 3821 return -EINVAL; 3822 } 3823 3824 if (s->refcount_order != 4) { 3825 error_report("compat=0.10 requires refcount_bits=16"); 3826 return -ENOTSUP; 3827 } 3828 3829 /* clear incompatible features */ 3830 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { 3831 ret = qcow2_mark_clean(bs); 3832 if (ret < 0) { 3833 return ret; 3834 } 3835 } 3836 3837 /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in 3838 * the first place; if that happens nonetheless, returning -ENOTSUP is the 3839 * best thing to do anyway */ 3840 3841 if (s->incompatible_features) { 3842 return -ENOTSUP; 3843 } 3844 3845 /* since we can ignore compatible features, we can set them to 0 as well */ 3846 s->compatible_features = 0; 3847 /* if lazy refcounts have been used, they have already been fixed through 3848 * clearing the dirty flag */ 3849 3850 /* clearing autoclear features is trivial */ 3851 s->autoclear_features = 0; 3852 3853 ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque); 3854 if (ret < 0) { 3855 return ret; 3856 } 3857 3858 s->qcow_version = target_version; 3859 ret = qcow2_update_header(bs); 3860 if (ret < 0) { 3861 s->qcow_version = current_version; 3862 return ret; 3863 } 3864 return 0; 3865 } 3866 3867 typedef enum Qcow2AmendOperation { 3868 /* This is the value Qcow2AmendHelperCBInfo::last_operation will be 3869 * statically initialized to so that the helper CB can discern the first 3870 * invocation from an operation change */ 3871 QCOW2_NO_OPERATION = 0, 3872 3873 QCOW2_CHANGING_REFCOUNT_ORDER, 3874 QCOW2_DOWNGRADING, 3875 } 
typedef enum Qcow2AmendOperation {
    /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
     * statically initialized to so that the helper CB can discern the first
     * invocation from an operation change */
    QCOW2_NO_OPERATION = 0,

    QCOW2_CHANGING_REFCOUNT_ORDER,
    QCOW2_DOWNGRADING,
} Qcow2AmendOperation;

typedef struct Qcow2AmendHelperCBInfo {
    /* The code coordinating the amend operations should only modify
     * these four fields; the rest will be managed by the CB */
    BlockDriverAmendStatusCB *original_status_cb;
    void *original_cb_opaque;

    Qcow2AmendOperation current_operation;

    /* Total number of operations to perform (only set once) */
    int total_operations;

    /* The following fields are managed by the CB */

    /* Number of operations completed */
    int operations_completed;

    /* Cumulative offset of all completed operations */
    int64_t offset_completed;

    Qcow2AmendOperation last_operation;
    int64_t last_work_size;
} Qcow2AmendHelperCBInfo;

static void qcow2_amend_helper_cb(BlockDriverState *bs,
                                  int64_t operation_offset,
                                  int64_t operation_work_size, void *opaque)
{
    Qcow2AmendHelperCBInfo *info = opaque;
    int64_t current_work_size;
    int64_t projected_work_size;

    if (info->current_operation != info->last_operation) {
        if (info->last_operation != QCOW2_NO_OPERATION) {
            info->offset_completed += info->last_work_size;
            info->operations_completed++;
        }

        info->last_operation = info->current_operation;
    }

    assert(info->total_operations > 0);
    assert(info->operations_completed < info->total_operations);

    info->last_work_size = operation_work_size;

    current_work_size = info->offset_completed + operation_work_size;

    /* current_work_size is the total work size for (operations_completed + 1)
     * operations (which includes this one), so multiply it by the number of
     * operations not covered and divide it by the number of operations
     * covered to get a projection for the operations not covered */
    projected_work_size = current_work_size * (info->total_operations -
                                               info->operations_completed - 1)
                                            / (info->operations_completed + 1);

    info->original_status_cb(bs, info->offset_completed + operation_offset,
                             current_work_size + projected_work_size,
                             info->original_cb_opaque);
}
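
/*
 * Worked example for the projection above (the numbers are illustrative,
 * not taken from this file): with total_operations == 2, a first operation
 * of work size 1000 reporting offset 300 gives
 *     current_work_size   = 0 + 1000 = 1000
 *     projected_work_size = 1000 * (2 - 0 - 1) / (0 + 1) = 1000
 * so the original callback sees 300 out of 2000, i.e. the not-yet-started
 * operation is assumed to be about as large as the one in progress.  Once
 * the second operation (say, work size 400) starts, offset_completed
 * becomes 1000, the projection drops to 0, and the callback sees
 * (1000 + offset) out of 1400.
 */
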
static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    int old_version = s->qcow_version, new_version = old_version;
    uint64_t new_size = 0;
    const char *backing_file = NULL, *backing_format = NULL;
    bool lazy_refcounts = s->use_lazy_refcounts;
    const char *compat = NULL;
    uint64_t cluster_size = s->cluster_size;
    bool encrypt;
    int encformat;
    int refcount_bits = s->refcount_bits;
    Error *local_err = NULL;
    int ret;
    QemuOptDesc *desc = opts->list->desc;
    Qcow2AmendHelperCBInfo helper_cb_info;

    while (desc && desc->name) {
        if (!qemu_opt_find(opts, desc->name)) {
            /* only change explicitly defined options */
            desc++;
            continue;
        }

        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
            if (!compat) {
                /* preserve default */
            } else if (!strcmp(compat, "0.10")) {
                new_version = 2;
            } else if (!strcmp(compat, "1.1")) {
                new_version = 3;
            } else {
                error_report("Unknown compatibility level %s", compat);
                return -EINVAL;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
            error_report("Cannot change preallocation mode");
            return -ENOTSUP;
        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
            encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
                                        !!s->crypto);

            if (encrypt != !!s->crypto) {
                error_report("Changing the encryption flag is not supported");
                return -ENOTSUP;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT_FORMAT)) {
            encformat = qcow2_crypt_method_from_format(
                qemu_opt_get(opts, BLOCK_OPT_ENCRYPT_FORMAT));

            if (encformat != s->crypt_method_header) {
                error_report("Changing the encryption format is not supported");
                return -ENOTSUP;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
            cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
                                             cluster_size);
            if (cluster_size != s->cluster_size) {
                error_report("Changing the cluster size is not supported");
                return -ENOTSUP;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
                                               lazy_refcounts);
        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
                                                refcount_bits);

            if (refcount_bits <= 0 || refcount_bits > 64 ||
                !is_power_of_2(refcount_bits))
            {
                error_report("Refcount width must be a power of two and may "
                             "not exceed 64 bits");
                return -EINVAL;
            }
        } else {
            /* if this point is reached, this probably means a new option was
             * added without having it covered here */
            abort();
        }

        desc++;
    }

    helper_cb_info = (Qcow2AmendHelperCBInfo){
        .original_status_cb = status_cb,
        .original_cb_opaque = cb_opaque,
        .total_operations = (new_version < old_version)
                          + (s->refcount_bits != refcount_bits)
    };

    /* Upgrade first (some features may require compat=1.1) */
    if (new_version > old_version) {
        s->qcow_version = new_version;
        ret = qcow2_update_header(bs);
        if (ret < 0) {
            s->qcow_version = old_version;
            return ret;
        }
    }

    if (s->refcount_bits != refcount_bits) {
        int refcount_order = ctz32(refcount_bits);

        if (new_version < 3 && refcount_bits != 16) {
            error_report("Refcount widths other than 16 bits require "
                         "compatibility level 1.1 or above (use compat=1.1 or "
                         "greater)");
            return -EINVAL;
        }

        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
        ret = qcow2_change_refcount_order(bs, refcount_order,
                                          &qcow2_amend_helper_cb,
                                          &helper_cb_info, &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            return ret;
        }
    }

    if (backing_file || backing_format) {
        ret = qcow2_change_backing_file(bs,
                                        backing_file ?: s->image_backing_file,
                                        backing_format ?: s->image_backing_format);
        if (ret < 0) {
            return ret;
        }
    }

    if (s->use_lazy_refcounts != lazy_refcounts) {
        if (lazy_refcounts) {
            if (new_version < 3) {
                error_report("Lazy refcounts only supported with compatibility "
                             "level 1.1 and above (use compat=1.1 or greater)");
                return -EINVAL;
            }
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
                return ret;
            }
            s->use_lazy_refcounts = true;
        } else {
            /* make image clean first */
            ret = qcow2_mark_clean(bs);
            if (ret < 0) {
                return ret;
            }
            /* now disallow lazy refcounts */
            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
                return ret;
            }
            s->use_lazy_refcounts = false;
        }
    }

    if (new_size) {
        BlockBackend *blk = blk_new(BLK_PERM_RESIZE, BLK_PERM_ALL);
        ret = blk_insert_bs(blk, bs, &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            blk_unref(blk);
            return ret;
        }

        ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, &local_err);
        blk_unref(blk);
        if (ret < 0) {
            error_report_err(local_err);
            return ret;
        }
    }

    /* Downgrade last (so unsupported features can be removed before) */
    if (new_version < old_version) {
        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
                              &helper_cb_info);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}
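
/*
 * For orientation (an illustrative invocation, not taken from this file):
 * the amend path above is what backs "qemu-img amend", e.g.
 *
 *     qemu-img amend -o compat=0.10,lazy_refcounts=off test.qcow2
 *
 * Only options that were explicitly specified are applied; everything else
 * keeps its current value, as enforced by the qemu_opt_find() check at the
 * top of the loop.
 */
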
/*
 * If offset or size is negative, the respective value is omitted from the
 * emitted BLOCK_IMAGE_CORRUPTED event.
 * fatal is ignored for read-only BDS; corruptions found there are always
 * considered non-fatal.
 */
void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
                             int64_t size, const char *message_format, ...)
{
    BDRVQcow2State *s = bs->opaque;
    const char *node_name;
    char *message;
    va_list ap;

    fatal = fatal && !bs->read_only;

    if (s->signaled_corruption &&
        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
    {
        return;
    }

    va_start(ap, message_format);
    message = g_strdup_vprintf(message_format, ap);
    va_end(ap);

    if (fatal) {
        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
                "corruption events will be suppressed\n", message);
    } else {
        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
                "corruption events will be suppressed\n", message);
    }

    node_name = bdrv_get_node_name(bs);
    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
                                          *node_name != '\0', node_name,
                                          message, offset >= 0, offset,
                                          size >= 0, size,
                                          fatal, &error_abort);
    g_free(message);

    if (fatal) {
        qcow2_mark_corrupt(bs);
        bs->drv = NULL; /* make BDS unusable */
    }

    s->signaled_corruption = true;
}
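
/*
 * Roughly, the event emitted above looks like this on the QMP wire (field
 * names follow the BLOCK_IMAGE_CORRUPTED QAPI definition; the values here
 * are made up for illustration):
 *
 *   { "event": "BLOCK_IMAGE_CORRUPTED",
 *     "data": { "device": "drive0", "node-name": "node0",
 *               "msg": "Preventing invalid write on metadata",
 *               "offset": 65536, "size": 512, "fatal": true } }
 */
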
static QemuOptsList qcow2_create_opts = {
    .name = "qcow2-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_COMPAT_LEVEL,
            .type = QEMU_OPT_STRING,
            .help = "Compatibility level (0.10 or 1.1)"
        },
        {
            .name = BLOCK_OPT_BACKING_FILE,
            .type = QEMU_OPT_STRING,
            .help = "File name of a base image"
        },
        {
            .name = BLOCK_OPT_BACKING_FMT,
            .type = QEMU_OPT_STRING,
            .help = "Image format of the base image"
        },
        {
            .name = BLOCK_OPT_ENCRYPT,
            .type = QEMU_OPT_BOOL,
            .help = "Encrypt the image with format 'aes'. (Deprecated "
                    "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
        },
        {
            .name = BLOCK_OPT_ENCRYPT_FORMAT,
            .type = QEMU_OPT_STRING,
            .help = "Encrypt the image, format choices: 'aes', 'luks'",
        },
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow AES key or LUKS passphrase"),
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),
        {
            .name = BLOCK_OPT_CLUSTER_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "qcow2 cluster size",
            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off, metadata, "
                    "falloc, full)"
        },
        {
            .name = BLOCK_OPT_LAZY_REFCOUNTS,
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
            .def_value_str = "off"
        },
        {
            .name = BLOCK_OPT_REFCOUNT_BITS,
            .type = QEMU_OPT_NUMBER,
            .help = "Width of a reference count entry in bits",
            .def_value_str = "16"
        },
        { /* end of list */ }
    }
};

BlockDriver bdrv_qcow2 = {
    .format_name = "qcow2",
    .instance_size = sizeof(BDRVQcow2State),
    .bdrv_probe = qcow2_probe,
    .bdrv_open = qcow2_open,
    .bdrv_close = qcow2_close,
    .bdrv_reopen_prepare = qcow2_reopen_prepare,
    .bdrv_reopen_commit = qcow2_reopen_commit,
    .bdrv_reopen_abort = qcow2_reopen_abort,
    .bdrv_join_options = qcow2_join_options,
    .bdrv_child_perm = bdrv_format_default_perms,
    .bdrv_create = qcow2_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = qcow2_co_get_block_status,

    .bdrv_co_preadv = qcow2_co_preadv,
    .bdrv_co_pwritev = qcow2_co_pwritev,
    .bdrv_co_flush_to_os = qcow2_co_flush_to_os,

    .bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes,
    .bdrv_co_pdiscard = qcow2_co_pdiscard,
    .bdrv_truncate = qcow2_truncate,
    .bdrv_co_pwritev_compressed = qcow2_co_pwritev_compressed,
    .bdrv_make_empty = qcow2_make_empty,

    .bdrv_snapshot_create = qcow2_snapshot_create,
    .bdrv_snapshot_goto = qcow2_snapshot_goto,
    .bdrv_snapshot_delete = qcow2_snapshot_delete,
    .bdrv_snapshot_list = qcow2_snapshot_list,
    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
    .bdrv_measure = qcow2_measure,
    .bdrv_get_info = qcow2_get_info,
    .bdrv_get_specific_info = qcow2_get_specific_info,

    .bdrv_save_vmstate = qcow2_save_vmstate,
    .bdrv_load_vmstate = qcow2_load_vmstate,

    .supports_backing = true,
    .bdrv_change_backing_file = qcow2_change_backing_file,

    .bdrv_refresh_limits = qcow2_refresh_limits,
    .bdrv_invalidate_cache = qcow2_invalidate_cache,
    .bdrv_inactivate = qcow2_inactivate,

    .create_opts = &qcow2_create_opts,
    .bdrv_check = qcow2_check,
    .bdrv_amend_options = qcow2_amend_options,

    .bdrv_detach_aio_context = qcow2_detach_aio_context,
    .bdrv_attach_aio_context = qcow2_attach_aio_context,

    .bdrv_reopen_bitmaps_rw = qcow2_reopen_bitmaps_rw,
    .bdrv_can_store_new_dirty_bitmap = qcow2_can_store_new_dirty_bitmap,
    .bdrv_remove_persistent_dirty_bitmap = qcow2_remove_persistent_dirty_bitmap,
};
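
/*
 * block_init() below arranges for bdrv_qcow2_init() to run during QEMU's
 * module initialization at startup; bdrv_register() then adds the "qcow2"
 * format to the block layer's driver list so it can be probed and opened
 * like any other format driver.
 */
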
static void bdrv_qcow2_init(void)
{
    bdrv_register(&bdrv_qcow2);
}

block_init(bdrv_qcow2_init);