1 /* 2 * Block driver for the QCOW version 2 format 3 * 4 * Copyright (c) 2004-2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 27 #include "block/qdict.h" 28 #include "sysemu/block-backend.h" 29 #include "qemu/main-loop.h" 30 #include "qemu/module.h" 31 #include "qcow2.h" 32 #include "qemu/error-report.h" 33 #include "qapi/error.h" 34 #include "qapi/qapi-events-block-core.h" 35 #include "qapi/qmp/qdict.h" 36 #include "qapi/qmp/qstring.h" 37 #include "trace.h" 38 #include "qemu/option_int.h" 39 #include "qemu/cutils.h" 40 #include "qemu/bswap.h" 41 #include "qapi/qobject-input-visitor.h" 42 #include "qapi/qapi-visit-block-core.h" 43 #include "crypto.h" 44 #include "block/aio_task.h" 45 46 /* 47 Differences with QCOW: 48 49 - Support for multiple incremental snapshots. 50 - Memory management by reference counts. 
51 - Clusters which have a reference count of one have the bit 52 QCOW_OFLAG_COPIED to optimize write performance. 53 - Size of compressed clusters is stored in sectors to reduce bit usage 54 in the cluster offsets. 55 - Support for storing additional data (such as the VM state) in the 56 snapshots. 57 - If a backing store is used, the cluster size is not constrained 58 (could be backported to QCOW). 59 - L2 tables have always a size of one cluster. 60 */ 61 62 63 typedef struct { 64 uint32_t magic; 65 uint32_t len; 66 } QEMU_PACKED QCowExtension; 67 68 #define QCOW2_EXT_MAGIC_END 0 69 #define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA 70 #define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 71 #define QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77 72 #define QCOW2_EXT_MAGIC_BITMAPS 0x23852875 73 #define QCOW2_EXT_MAGIC_DATA_FILE 0x44415441 74 75 static int coroutine_fn 76 qcow2_co_preadv_compressed(BlockDriverState *bs, 77 uint64_t file_cluster_offset, 78 uint64_t offset, 79 uint64_t bytes, 80 QEMUIOVector *qiov, 81 size_t qiov_offset); 82 83 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) 84 { 85 const QCowHeader *cow_header = (const void *)buf; 86 87 if (buf_size >= sizeof(QCowHeader) && 88 be32_to_cpu(cow_header->magic) == QCOW_MAGIC && 89 be32_to_cpu(cow_header->version) >= 2) 90 return 100; 91 else 92 return 0; 93 } 94 95 96 static ssize_t qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset, 97 uint8_t *buf, size_t buflen, 98 void *opaque, Error **errp) 99 { 100 BlockDriverState *bs = opaque; 101 BDRVQcow2State *s = bs->opaque; 102 ssize_t ret; 103 104 if ((offset + buflen) > s->crypto_header.length) { 105 error_setg(errp, "Request for data outside of extension header"); 106 return -1; 107 } 108 109 ret = bdrv_pread(bs->file, 110 s->crypto_header.offset + offset, buf, buflen); 111 if (ret < 0) { 112 error_setg_errno(errp, -ret, "Could not read encryption header"); 113 return -1; 114 } 115 return ret; 116 } 117 118 119 static 
ssize_t qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen, 120 void *opaque, Error **errp) 121 { 122 BlockDriverState *bs = opaque; 123 BDRVQcow2State *s = bs->opaque; 124 int64_t ret; 125 int64_t clusterlen; 126 127 ret = qcow2_alloc_clusters(bs, headerlen); 128 if (ret < 0) { 129 error_setg_errno(errp, -ret, 130 "Cannot allocate cluster for LUKS header size %zu", 131 headerlen); 132 return -1; 133 } 134 135 s->crypto_header.length = headerlen; 136 s->crypto_header.offset = ret; 137 138 /* Zero fill remaining space in cluster so it has predictable 139 * content in case of future spec changes */ 140 clusterlen = size_to_clusters(s, headerlen) * s->cluster_size; 141 assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen, false) == 0); 142 ret = bdrv_pwrite_zeroes(bs->file, 143 ret + headerlen, 144 clusterlen - headerlen, 0); 145 if (ret < 0) { 146 error_setg_errno(errp, -ret, "Could not zero fill encryption header"); 147 return -1; 148 } 149 150 return ret; 151 } 152 153 154 static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset, 155 const uint8_t *buf, size_t buflen, 156 void *opaque, Error **errp) 157 { 158 BlockDriverState *bs = opaque; 159 BDRVQcow2State *s = bs->opaque; 160 ssize_t ret; 161 162 if ((offset + buflen) > s->crypto_header.length) { 163 error_setg(errp, "Request for data outside of extension header"); 164 return -1; 165 } 166 167 ret = bdrv_pwrite(bs->file, 168 s->crypto_header.offset + offset, buf, buflen); 169 if (ret < 0) { 170 error_setg_errno(errp, -ret, "Could not read encryption header"); 171 return -1; 172 } 173 return ret; 174 } 175 176 177 /* 178 * read qcow2 extension and fill bs 179 * start reading from start_offset 180 * finish reading upon magic of value 0 or when end_offset reached 181 * unknown magic is skipped (future extension this version knows nothing about) 182 * return 0 upon success, non-0 otherwise 183 */ 184 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t 
start_offset, 185 uint64_t end_offset, void **p_feature_table, 186 int flags, bool *need_update_header, 187 Error **errp) 188 { 189 BDRVQcow2State *s = bs->opaque; 190 QCowExtension ext; 191 uint64_t offset; 192 int ret; 193 Qcow2BitmapHeaderExt bitmaps_ext; 194 195 if (need_update_header != NULL) { 196 *need_update_header = false; 197 } 198 199 #ifdef DEBUG_EXT 200 printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); 201 #endif 202 offset = start_offset; 203 while (offset < end_offset) { 204 205 #ifdef DEBUG_EXT 206 /* Sanity check */ 207 if (offset > s->cluster_size) 208 printf("qcow2_read_extension: suspicious offset %lu\n", offset); 209 210 printf("attempting to read extended header in offset %lu\n", offset); 211 #endif 212 213 ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext)); 214 if (ret < 0) { 215 error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " 216 "pread fail from offset %" PRIu64, offset); 217 return 1; 218 } 219 ext.magic = be32_to_cpu(ext.magic); 220 ext.len = be32_to_cpu(ext.len); 221 offset += sizeof(ext); 222 #ifdef DEBUG_EXT 223 printf("ext.magic = 0x%x\n", ext.magic); 224 #endif 225 if (offset > end_offset || ext.len > end_offset - offset) { 226 error_setg(errp, "Header extension too large"); 227 return -EINVAL; 228 } 229 230 switch (ext.magic) { 231 case QCOW2_EXT_MAGIC_END: 232 return 0; 233 234 case QCOW2_EXT_MAGIC_BACKING_FORMAT: 235 if (ext.len >= sizeof(bs->backing_format)) { 236 error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32 237 " too large (>=%zu)", ext.len, 238 sizeof(bs->backing_format)); 239 return 2; 240 } 241 ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); 242 if (ret < 0) { 243 error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " 244 "Could not read format name"); 245 return 3; 246 } 247 bs->backing_format[ext.len] = '\0'; 248 s->image_backing_format = g_strdup(bs->backing_format); 249 #ifdef DEBUG_EXT 250 printf("Qcow2: Got format extension %s\n", 
bs->backing_format); 251 #endif 252 break; 253 254 case QCOW2_EXT_MAGIC_FEATURE_TABLE: 255 if (p_feature_table != NULL) { 256 void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); 257 ret = bdrv_pread(bs->file, offset , feature_table, ext.len); 258 if (ret < 0) { 259 error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " 260 "Could not read table"); 261 return ret; 262 } 263 264 *p_feature_table = feature_table; 265 } 266 break; 267 268 case QCOW2_EXT_MAGIC_CRYPTO_HEADER: { 269 unsigned int cflags = 0; 270 if (s->crypt_method_header != QCOW_CRYPT_LUKS) { 271 error_setg(errp, "CRYPTO header extension only " 272 "expected with LUKS encryption method"); 273 return -EINVAL; 274 } 275 if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) { 276 error_setg(errp, "CRYPTO header extension size %u, " 277 "but expected size %zu", ext.len, 278 sizeof(Qcow2CryptoHeaderExtension)); 279 return -EINVAL; 280 } 281 282 ret = bdrv_pread(bs->file, offset, &s->crypto_header, ext.len); 283 if (ret < 0) { 284 error_setg_errno(errp, -ret, 285 "Unable to read CRYPTO header extension"); 286 return ret; 287 } 288 s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset); 289 s->crypto_header.length = be64_to_cpu(s->crypto_header.length); 290 291 if ((s->crypto_header.offset % s->cluster_size) != 0) { 292 error_setg(errp, "Encryption header offset '%" PRIu64 "' is " 293 "not a multiple of cluster size '%u'", 294 s->crypto_header.offset, s->cluster_size); 295 return -EINVAL; 296 } 297 298 if (flags & BDRV_O_NO_IO) { 299 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; 300 } 301 s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.", 302 qcow2_crypto_hdr_read_func, 303 bs, cflags, QCOW2_MAX_THREADS, errp); 304 if (!s->crypto) { 305 return -EINVAL; 306 } 307 } break; 308 309 case QCOW2_EXT_MAGIC_BITMAPS: 310 if (ext.len != sizeof(bitmaps_ext)) { 311 error_setg_errno(errp, -ret, "bitmaps_ext: " 312 "Invalid extension length"); 313 return -EINVAL; 314 } 315 316 if 
(!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) { 317 if (s->qcow_version < 3) { 318 /* Let's be a bit more specific */ 319 warn_report("This qcow2 v2 image contains bitmaps, but " 320 "they may have been modified by a program " 321 "without persistent bitmap support; so now " 322 "they must all be considered inconsistent"); 323 } else { 324 warn_report("a program lacking bitmap support " 325 "modified this file, so all bitmaps are now " 326 "considered inconsistent"); 327 } 328 error_printf("Some clusters may be leaked, " 329 "run 'qemu-img check -r' on the image " 330 "file to fix."); 331 if (need_update_header != NULL) { 332 /* Updating is needed to drop invalid bitmap extension. */ 333 *need_update_header = true; 334 } 335 break; 336 } 337 338 ret = bdrv_pread(bs->file, offset, &bitmaps_ext, ext.len); 339 if (ret < 0) { 340 error_setg_errno(errp, -ret, "bitmaps_ext: " 341 "Could not read ext header"); 342 return ret; 343 } 344 345 if (bitmaps_ext.reserved32 != 0) { 346 error_setg_errno(errp, -ret, "bitmaps_ext: " 347 "Reserved field is not zero"); 348 return -EINVAL; 349 } 350 351 bitmaps_ext.nb_bitmaps = be32_to_cpu(bitmaps_ext.nb_bitmaps); 352 bitmaps_ext.bitmap_directory_size = 353 be64_to_cpu(bitmaps_ext.bitmap_directory_size); 354 bitmaps_ext.bitmap_directory_offset = 355 be64_to_cpu(bitmaps_ext.bitmap_directory_offset); 356 357 if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) { 358 error_setg(errp, 359 "bitmaps_ext: Image has %" PRIu32 " bitmaps, " 360 "exceeding the QEMU supported maximum of %d", 361 bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS); 362 return -EINVAL; 363 } 364 365 if (bitmaps_ext.nb_bitmaps == 0) { 366 error_setg(errp, "found bitmaps extension with zero bitmaps"); 367 return -EINVAL; 368 } 369 370 if (offset_into_cluster(s, bitmaps_ext.bitmap_directory_offset)) { 371 error_setg(errp, "bitmaps_ext: " 372 "invalid bitmap directory offset"); 373 return -EINVAL; 374 } 375 376 if (bitmaps_ext.bitmap_directory_size > 377 
QCOW2_MAX_BITMAP_DIRECTORY_SIZE) { 378 error_setg(errp, "bitmaps_ext: " 379 "bitmap directory size (%" PRIu64 ") exceeds " 380 "the maximum supported size (%d)", 381 bitmaps_ext.bitmap_directory_size, 382 QCOW2_MAX_BITMAP_DIRECTORY_SIZE); 383 return -EINVAL; 384 } 385 386 s->nb_bitmaps = bitmaps_ext.nb_bitmaps; 387 s->bitmap_directory_offset = 388 bitmaps_ext.bitmap_directory_offset; 389 s->bitmap_directory_size = 390 bitmaps_ext.bitmap_directory_size; 391 392 #ifdef DEBUG_EXT 393 printf("Qcow2: Got bitmaps extension: " 394 "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n", 395 s->bitmap_directory_offset, s->nb_bitmaps); 396 #endif 397 break; 398 399 case QCOW2_EXT_MAGIC_DATA_FILE: 400 { 401 s->image_data_file = g_malloc0(ext.len + 1); 402 ret = bdrv_pread(bs->file, offset, s->image_data_file, ext.len); 403 if (ret < 0) { 404 error_setg_errno(errp, -ret, 405 "ERROR: Could not read data file name"); 406 return ret; 407 } 408 #ifdef DEBUG_EXT 409 printf("Qcow2: Got external data file %s\n", s->image_data_file); 410 #endif 411 break; 412 } 413 414 default: 415 /* unknown magic - save it in case we need to rewrite the header */ 416 /* If you add a new feature, make sure to also update the fast 417 * path of qcow2_make_empty() to deal with it. 
*/ 418 { 419 Qcow2UnknownHeaderExtension *uext; 420 421 uext = g_malloc0(sizeof(*uext) + ext.len); 422 uext->magic = ext.magic; 423 uext->len = ext.len; 424 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); 425 426 ret = bdrv_pread(bs->file, offset , uext->data, uext->len); 427 if (ret < 0) { 428 error_setg_errno(errp, -ret, "ERROR: unknown extension: " 429 "Could not read data"); 430 return ret; 431 } 432 } 433 break; 434 } 435 436 offset += ((ext.len + 7) & ~7); 437 } 438 439 return 0; 440 } 441 442 static void cleanup_unknown_header_ext(BlockDriverState *bs) 443 { 444 BDRVQcow2State *s = bs->opaque; 445 Qcow2UnknownHeaderExtension *uext, *next; 446 447 QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { 448 QLIST_REMOVE(uext, next); 449 g_free(uext); 450 } 451 } 452 453 static void report_unsupported_feature(Error **errp, Qcow2Feature *table, 454 uint64_t mask) 455 { 456 char *features = g_strdup(""); 457 char *old; 458 459 while (table && table->name[0] != '\0') { 460 if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { 461 if (mask & (1ULL << table->bit)) { 462 old = features; 463 features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "", 464 table->name); 465 g_free(old); 466 mask &= ~(1ULL << table->bit); 467 } 468 } 469 table++; 470 } 471 472 if (mask) { 473 old = features; 474 features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64, 475 old, *old ? ", " : "", mask); 476 g_free(old); 477 } 478 479 error_setg(errp, "Unsupported qcow2 feature(s): %s", features); 480 g_free(features); 481 } 482 483 /* 484 * Sets the dirty bit and flushes afterwards if necessary. 485 * 486 * The incompatible_features bit is only set if the image file header was 487 * updated successfully. Therefore it is not required to check the return 488 * value of this function. 
489 */ 490 int qcow2_mark_dirty(BlockDriverState *bs) 491 { 492 BDRVQcow2State *s = bs->opaque; 493 uint64_t val; 494 int ret; 495 496 assert(s->qcow_version >= 3); 497 498 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { 499 return 0; /* already dirty */ 500 } 501 502 val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); 503 ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), 504 &val, sizeof(val)); 505 if (ret < 0) { 506 return ret; 507 } 508 ret = bdrv_flush(bs->file->bs); 509 if (ret < 0) { 510 return ret; 511 } 512 513 /* Only treat image as dirty if the header was updated successfully */ 514 s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; 515 return 0; 516 } 517 518 /* 519 * Clears the dirty bit and flushes before if necessary. Only call this 520 * function when there are no pending requests, it does not guard against 521 * concurrent requests dirtying the image. 522 */ 523 static int qcow2_mark_clean(BlockDriverState *bs) 524 { 525 BDRVQcow2State *s = bs->opaque; 526 527 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { 528 int ret; 529 530 s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; 531 532 ret = qcow2_flush_caches(bs); 533 if (ret < 0) { 534 return ret; 535 } 536 537 return qcow2_update_header(bs); 538 } 539 return 0; 540 } 541 542 /* 543 * Marks the image as corrupt. 544 */ 545 int qcow2_mark_corrupt(BlockDriverState *bs) 546 { 547 BDRVQcow2State *s = bs->opaque; 548 549 s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; 550 return qcow2_update_header(bs); 551 } 552 553 /* 554 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes 555 * before if necessary. 
556 */ 557 int qcow2_mark_consistent(BlockDriverState *bs) 558 { 559 BDRVQcow2State *s = bs->opaque; 560 561 if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { 562 int ret = qcow2_flush_caches(bs); 563 if (ret < 0) { 564 return ret; 565 } 566 567 s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; 568 return qcow2_update_header(bs); 569 } 570 return 0; 571 } 572 573 static void qcow2_add_check_result(BdrvCheckResult *out, 574 const BdrvCheckResult *src, 575 bool set_allocation_info) 576 { 577 out->corruptions += src->corruptions; 578 out->leaks += src->leaks; 579 out->check_errors += src->check_errors; 580 out->corruptions_fixed += src->corruptions_fixed; 581 out->leaks_fixed += src->leaks_fixed; 582 583 if (set_allocation_info) { 584 out->image_end_offset = src->image_end_offset; 585 out->bfi = src->bfi; 586 } 587 } 588 589 static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs, 590 BdrvCheckResult *result, 591 BdrvCheckMode fix) 592 { 593 BdrvCheckResult snapshot_res = {}; 594 BdrvCheckResult refcount_res = {}; 595 int ret; 596 597 memset(result, 0, sizeof(*result)); 598 599 ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix); 600 if (ret < 0) { 601 qcow2_add_check_result(result, &snapshot_res, false); 602 return ret; 603 } 604 605 ret = qcow2_check_refcounts(bs, &refcount_res, fix); 606 qcow2_add_check_result(result, &refcount_res, true); 607 if (ret < 0) { 608 qcow2_add_check_result(result, &snapshot_res, false); 609 return ret; 610 } 611 612 ret = qcow2_check_fix_snapshot_table(bs, &snapshot_res, fix); 613 qcow2_add_check_result(result, &snapshot_res, false); 614 if (ret < 0) { 615 return ret; 616 } 617 618 if (fix && result->check_errors == 0 && result->corruptions == 0) { 619 ret = qcow2_mark_clean(bs); 620 if (ret < 0) { 621 return ret; 622 } 623 return qcow2_mark_consistent(bs); 624 } 625 return ret; 626 } 627 628 static int coroutine_fn qcow2_co_check(BlockDriverState *bs, 629 BdrvCheckResult *result, 630 BdrvCheckMode fix) 
631 { 632 BDRVQcow2State *s = bs->opaque; 633 int ret; 634 635 qemu_co_mutex_lock(&s->lock); 636 ret = qcow2_co_check_locked(bs, result, fix); 637 qemu_co_mutex_unlock(&s->lock); 638 return ret; 639 } 640 641 int qcow2_validate_table(BlockDriverState *bs, uint64_t offset, 642 uint64_t entries, size_t entry_len, 643 int64_t max_size_bytes, const char *table_name, 644 Error **errp) 645 { 646 BDRVQcow2State *s = bs->opaque; 647 648 if (entries > max_size_bytes / entry_len) { 649 error_setg(errp, "%s too large", table_name); 650 return -EFBIG; 651 } 652 653 /* Use signed INT64_MAX as the maximum even for uint64_t header fields, 654 * because values will be passed to qemu functions taking int64_t. */ 655 if ((INT64_MAX - entries * entry_len < offset) || 656 (offset_into_cluster(s, offset) != 0)) { 657 error_setg(errp, "%s offset invalid", table_name); 658 return -EINVAL; 659 } 660 661 return 0; 662 } 663 664 static const char *const mutable_opts[] = { 665 QCOW2_OPT_LAZY_REFCOUNTS, 666 QCOW2_OPT_DISCARD_REQUEST, 667 QCOW2_OPT_DISCARD_SNAPSHOT, 668 QCOW2_OPT_DISCARD_OTHER, 669 QCOW2_OPT_OVERLAP, 670 QCOW2_OPT_OVERLAP_TEMPLATE, 671 QCOW2_OPT_OVERLAP_MAIN_HEADER, 672 QCOW2_OPT_OVERLAP_ACTIVE_L1, 673 QCOW2_OPT_OVERLAP_ACTIVE_L2, 674 QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, 675 QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, 676 QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, 677 QCOW2_OPT_OVERLAP_INACTIVE_L1, 678 QCOW2_OPT_OVERLAP_INACTIVE_L2, 679 QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY, 680 QCOW2_OPT_CACHE_SIZE, 681 QCOW2_OPT_L2_CACHE_SIZE, 682 QCOW2_OPT_L2_CACHE_ENTRY_SIZE, 683 QCOW2_OPT_REFCOUNT_CACHE_SIZE, 684 QCOW2_OPT_CACHE_CLEAN_INTERVAL, 685 NULL 686 }; 687 688 static QemuOptsList qcow2_runtime_opts = { 689 .name = "qcow2", 690 .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), 691 .desc = { 692 { 693 .name = QCOW2_OPT_LAZY_REFCOUNTS, 694 .type = QEMU_OPT_BOOL, 695 .help = "Postpone refcount updates", 696 }, 697 { 698 .name = QCOW2_OPT_DISCARD_REQUEST, 699 .type = QEMU_OPT_BOOL, 700 .help = "Pass 
guest discard requests to the layer below", 701 }, 702 { 703 .name = QCOW2_OPT_DISCARD_SNAPSHOT, 704 .type = QEMU_OPT_BOOL, 705 .help = "Generate discard requests when snapshot related space " 706 "is freed", 707 }, 708 { 709 .name = QCOW2_OPT_DISCARD_OTHER, 710 .type = QEMU_OPT_BOOL, 711 .help = "Generate discard requests when other clusters are freed", 712 }, 713 { 714 .name = QCOW2_OPT_OVERLAP, 715 .type = QEMU_OPT_STRING, 716 .help = "Selects which overlap checks to perform from a range of " 717 "templates (none, constant, cached, all)", 718 }, 719 { 720 .name = QCOW2_OPT_OVERLAP_TEMPLATE, 721 .type = QEMU_OPT_STRING, 722 .help = "Selects which overlap checks to perform from a range of " 723 "templates (none, constant, cached, all)", 724 }, 725 { 726 .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, 727 .type = QEMU_OPT_BOOL, 728 .help = "Check for unintended writes into the main qcow2 header", 729 }, 730 { 731 .name = QCOW2_OPT_OVERLAP_ACTIVE_L1, 732 .type = QEMU_OPT_BOOL, 733 .help = "Check for unintended writes into the active L1 table", 734 }, 735 { 736 .name = QCOW2_OPT_OVERLAP_ACTIVE_L2, 737 .type = QEMU_OPT_BOOL, 738 .help = "Check for unintended writes into an active L2 table", 739 }, 740 { 741 .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, 742 .type = QEMU_OPT_BOOL, 743 .help = "Check for unintended writes into the refcount table", 744 }, 745 { 746 .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, 747 .type = QEMU_OPT_BOOL, 748 .help = "Check for unintended writes into a refcount block", 749 }, 750 { 751 .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, 752 .type = QEMU_OPT_BOOL, 753 .help = "Check for unintended writes into the snapshot table", 754 }, 755 { 756 .name = QCOW2_OPT_OVERLAP_INACTIVE_L1, 757 .type = QEMU_OPT_BOOL, 758 .help = "Check for unintended writes into an inactive L1 table", 759 }, 760 { 761 .name = QCOW2_OPT_OVERLAP_INACTIVE_L2, 762 .type = QEMU_OPT_BOOL, 763 .help = "Check for unintended writes into an inactive L2 table", 764 }, 765 { 766 .name = 
QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY, 767 .type = QEMU_OPT_BOOL, 768 .help = "Check for unintended writes into the bitmap directory", 769 }, 770 { 771 .name = QCOW2_OPT_CACHE_SIZE, 772 .type = QEMU_OPT_SIZE, 773 .help = "Maximum combined metadata (L2 tables and refcount blocks) " 774 "cache size", 775 }, 776 { 777 .name = QCOW2_OPT_L2_CACHE_SIZE, 778 .type = QEMU_OPT_SIZE, 779 .help = "Maximum L2 table cache size", 780 }, 781 { 782 .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE, 783 .type = QEMU_OPT_SIZE, 784 .help = "Size of each entry in the L2 cache", 785 }, 786 { 787 .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE, 788 .type = QEMU_OPT_SIZE, 789 .help = "Maximum refcount block cache size", 790 }, 791 { 792 .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL, 793 .type = QEMU_OPT_NUMBER, 794 .help = "Clean unused cache entries after this time (in seconds)", 795 }, 796 BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.", 797 "ID of secret providing qcow2 AES key or LUKS passphrase"), 798 { /* end of list */ } 799 }, 800 }; 801 802 static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { 803 [QCOW2_OL_MAIN_HEADER_BITNR] = QCOW2_OPT_OVERLAP_MAIN_HEADER, 804 [QCOW2_OL_ACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L1, 805 [QCOW2_OL_ACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L2, 806 [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, 807 [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, 808 [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, 809 [QCOW2_OL_INACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L1, 810 [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, 811 [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY, 812 }; 813 814 static void cache_clean_timer_cb(void *opaque) 815 { 816 BlockDriverState *bs = opaque; 817 BDRVQcow2State *s = bs->opaque; 818 qcow2_cache_clean_unused(s->l2_table_cache); 819 qcow2_cache_clean_unused(s->refcount_block_cache); 820 timer_mod(s->cache_clean_timer, 
qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 821 (int64_t) s->cache_clean_interval * 1000); 822 } 823 824 static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context) 825 { 826 BDRVQcow2State *s = bs->opaque; 827 if (s->cache_clean_interval > 0) { 828 s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL, 829 SCALE_MS, cache_clean_timer_cb, 830 bs); 831 timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 832 (int64_t) s->cache_clean_interval * 1000); 833 } 834 } 835 836 static void cache_clean_timer_del(BlockDriverState *bs) 837 { 838 BDRVQcow2State *s = bs->opaque; 839 if (s->cache_clean_timer) { 840 timer_del(s->cache_clean_timer); 841 timer_free(s->cache_clean_timer); 842 s->cache_clean_timer = NULL; 843 } 844 } 845 846 static void qcow2_detach_aio_context(BlockDriverState *bs) 847 { 848 cache_clean_timer_del(bs); 849 } 850 851 static void qcow2_attach_aio_context(BlockDriverState *bs, 852 AioContext *new_context) 853 { 854 cache_clean_timer_init(bs, new_context); 855 } 856 857 static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, 858 uint64_t *l2_cache_size, 859 uint64_t *l2_cache_entry_size, 860 uint64_t *refcount_cache_size, Error **errp) 861 { 862 BDRVQcow2State *s = bs->opaque; 863 uint64_t combined_cache_size, l2_cache_max_setting; 864 bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set; 865 bool l2_cache_entry_size_set; 866 int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size; 867 uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; 868 uint64_t max_l2_entries = DIV_ROUND_UP(virtual_disk_size, s->cluster_size); 869 /* An L2 table is always one cluster in size so the max cache size 870 * should be a multiple of the cluster size. 
*/ 871 uint64_t max_l2_cache = ROUND_UP(max_l2_entries * sizeof(uint64_t), 872 s->cluster_size); 873 874 combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE); 875 l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE); 876 refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE); 877 l2_cache_entry_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE); 878 879 combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0); 880 l2_cache_max_setting = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 881 DEFAULT_L2_CACHE_MAX_SIZE); 882 *refcount_cache_size = qemu_opt_get_size(opts, 883 QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0); 884 885 *l2_cache_entry_size = qemu_opt_get_size( 886 opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size); 887 888 *l2_cache_size = MIN(max_l2_cache, l2_cache_max_setting); 889 890 if (combined_cache_size_set) { 891 if (l2_cache_size_set && refcount_cache_size_set) { 892 error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE 893 " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set " 894 "at the same time"); 895 return; 896 } else if (l2_cache_size_set && 897 (l2_cache_max_setting > combined_cache_size)) { 898 error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed " 899 QCOW2_OPT_CACHE_SIZE); 900 return; 901 } else if (*refcount_cache_size > combined_cache_size) { 902 error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed " 903 QCOW2_OPT_CACHE_SIZE); 904 return; 905 } 906 907 if (l2_cache_size_set) { 908 *refcount_cache_size = combined_cache_size - *l2_cache_size; 909 } else if (refcount_cache_size_set) { 910 *l2_cache_size = combined_cache_size - *refcount_cache_size; 911 } else { 912 /* Assign as much memory as possible to the L2 cache, and 913 * use the remainder for the refcount cache */ 914 if (combined_cache_size >= max_l2_cache + min_refcount_cache) { 915 *l2_cache_size = max_l2_cache; 916 *refcount_cache_size = combined_cache_size - *l2_cache_size; 917 } else { 
/*
 * Staging area for a set of runtime options that has been parsed and
 * validated but not yet applied to BDRVQcow2State.
 *
 * It is filled in by qcow2_update_options_prepare() and afterwards either
 * handed to qcow2_update_options_commit() (which moves the values into the
 * driver state) or to qcow2_update_options_abort() (which releases the
 * caches and crypto options it owns).
 */
typedef struct Qcow2ReopenState {
    Qcow2Cache *l2_table_cache;
    Qcow2Cache *refcount_block_cache;
    int l2_slice_size; /* Number of entries in a slice of the L2 table */
    bool use_lazy_refcounts;
    int overlap_check;
    bool discard_passthrough[QCOW2_DISCARD_MAX];
    uint64_t cache_clean_interval;
    QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
} Qcow2ReopenState;

/*
 * Parse and validate the qcow2 runtime options and stage the result in @r.
 *
 * @bs:      the qcow2 node; its current state is read for defaults
 * @r:       staging area that receives the new caches, flags and crypto
 *           options; nothing in it is applied to @bs here
 * @options: option dictionary; the "encrypt." sub-keys are extracted from it
 *           and the known runtime options are absorbed out of it
 * @flags:   open flags; only BDRV_O_UNMAP is consulted here (as the default
 *           for the discard-on-request option)
 * @errp:    set on failure
 *
 * This function is not free of side effects on the image: the existing L2
 * and refcount caches are flushed, and qcow2_mark_clean() is called when
 * lazy refcounts go from enabled to disabled.
 *
 * Returns 0 on success and a negative errno value on failure. On failure
 * whatever has already been stored in @r is NOT released here; the callers
 * in this file pass @r to qcow2_update_options_abort() for that.
 */
static int qcow2_update_options_prepare(BlockDriverState *bs,
                                        Qcow2ReopenState *r,
                                        QDict *options, int flags,
                                        Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    QemuOpts *opts = NULL;
    const char *opt_overlap_check, *opt_overlap_check_template;
    int overlap_check_template = 0;
    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
    int i;
    const char *encryptfmt;
    QDict *encryptopts = NULL;
    Error *local_err = NULL;
    int ret;

    /* Split off the "encrypt.*" keys; they are handled at the very end */
    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
    encryptfmt = qdict_get_try_str(encryptopts, "format");

    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* get L2 table/refcount block cache size from command line options */
    read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
                     &refcount_cache_size, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* Convert the byte size into a number of cache entries and clamp it */
    l2_cache_size /= l2_cache_entry_size;
    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
        l2_cache_size = MIN_L2_CACHE_SIZE;
    }
    if (l2_cache_size > INT_MAX) {
        error_setg(errp, "L2 cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    /* Same conversion for the refcount cache; its entries are clusters */
    refcount_cache_size /= s->cluster_size;
    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
    }
    if (refcount_cache_size > INT_MAX) {
        error_setg(errp, "Refcount cache size too big");
        ret = -EINVAL;
        goto fail;
    }

    /* alloc new L2 table/refcount block cache, flush old one */
    if (s->l2_table_cache) {
        ret = qcow2_cache_flush(bs, s->l2_table_cache);
        if (ret) {
            error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
            goto fail;
        }
    }

    if (s->refcount_block_cache) {
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the refcount block cache");
            goto fail;
        }
    }

    r->l2_slice_size = l2_cache_entry_size / sizeof(uint64_t);
    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
                                           l2_cache_entry_size);
    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
                                                 s->cluster_size);
    if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
        error_setg(errp, "Could not allocate metadata caches");
        ret = -ENOMEM;
        goto fail;
    }

    /* New interval for cache cleanup timer */
    r->cache_clean_interval =
        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
                            DEFAULT_CACHE_CLEAN_INTERVAL);
#ifndef CONFIG_LINUX
    if (r->cache_clean_interval != 0) {
        error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
                   " not supported on this host");
        ret = -EINVAL;
        goto fail;
    }
#endif
    if (r->cache_clean_interval > UINT_MAX) {
        error_setg(errp, "Cache clean interval too big");
        ret = -EINVAL;
        goto fail;
    }

    /* lazy-refcounts; flush if going from enabled to disabled */
    r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
    if (r->use_lazy_refcounts && s->qcow_version < 3) {
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
                   "qemu 1.1 compatibility level");
        ret = -EINVAL;
        goto fail;
    }

    if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
        ret = qcow2_mark_clean(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
            goto fail;
        }
    }

    /* Overlap check options */
    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
    /* Both spellings may be given, but only if they agree */
    if (opt_overlap_check_template && opt_overlap_check &&
        strcmp(opt_overlap_check_template, opt_overlap_check))
    {
        error_setg(errp, "Conflicting values for qcow2 options '"
                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
        ret = -EINVAL;
        goto fail;
    }
    if (!opt_overlap_check) {
        opt_overlap_check = opt_overlap_check_template ?: "cached";
    }

    if (!strcmp(opt_overlap_check, "none")) {
        overlap_check_template = 0;
    } else if (!strcmp(opt_overlap_check, "constant")) {
        overlap_check_template = QCOW2_OL_CONSTANT;
    } else if (!strcmp(opt_overlap_check, "cached")) {
        overlap_check_template = QCOW2_OL_CACHED;
    } else if (!strcmp(opt_overlap_check, "all")) {
        overlap_check_template = QCOW2_OL_ALL;
    } else {
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
                   "'overlap-check'. Allowed are any of the following: "
                   "none, constant, cached, all", opt_overlap_check);
        ret = -EINVAL;
        goto fail;
    }

    r->overlap_check = 0;
    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
        /* overlap-check defines a template bitmask, but every flag may be
         * overwritten through the associated boolean option */
        r->overlap_check |=
            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
                              overlap_check_template & (1 << i)) << i;
    }

    /* NEVER and ALWAYS are fixed; the other discard types are options */
    r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
    r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
    r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
                          flags & BDRV_O_UNMAP);
    r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
    r->discard_passthrough[QCOW2_DISCARD_OTHER] =
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);

    /*
     * The encryption format requested in the options (if any) must match
     * the crypt method recorded in the image header.
     */
    switch (s->crypt_method_header) {
    case QCOW_CRYPT_NONE:
        if (encryptfmt) {
            error_setg(errp, "No encryption in image header, but options "
                       "specified format '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        break;

    case QCOW_CRYPT_AES:
        if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
            error_setg(errp,
                       "Header reported 'aes' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        qdict_put_str(encryptopts, "format", "qcow");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
        break;

    case QCOW_CRYPT_LUKS:
        if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
            error_setg(errp,
                       "Header reported 'luks' encryption format but "
                       "options specify '%s'", encryptfmt);
            ret = -EINVAL;
            goto fail;
        }
        qdict_put_str(encryptopts, "format", "luks");
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
        break;

    default:
        error_setg(errp, "Unsupported encryption method %d",
                   s->crypt_method_header);
        break;
    }
    /*
     * Catches both a failed block_crypto_open_opts_init() and the
     * unsupported-method case above; errp has been set in either case.
     */
    if (s->crypt_method_header != QCOW_CRYPT_NONE && !r->crypto_opts) {
        ret = -EINVAL;
        goto fail;
    }

    ret = 0;
fail:
    /* Shared exit path: only the temporaries are released here, not @r */
    qobject_unref(encryptopts);
    qemu_opts_del(opts);
    opts = NULL;
    return ret;
}

/*
 * Apply the options staged in @r to the driver state of @bs.
 *
 * The caches currently installed in the driver state are destroyed and
 * replaced by the ones from @r, the cache-clean timer is re-armed if its
 * interval changed, and the previous crypto options are freed. After this
 * call the caches and crypto options referenced by @r belong to the driver
 * state.
 */
static void qcow2_update_options_commit(BlockDriverState *bs,
                                        Qcow2ReopenState *r)
{
    BDRVQcow2State *s = bs->opaque;
    int i;

    if (s->l2_table_cache) {
        qcow2_cache_destroy(s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(s->refcount_block_cache);
    }
    s->l2_table_cache = r->l2_table_cache;
    s->refcount_block_cache = r->refcount_block_cache;
    s->l2_slice_size = r->l2_slice_size;

    s->overlap_check = r->overlap_check;
    s->use_lazy_refcounts = r->use_lazy_refcounts;

    for (i = 0; i < QCOW2_DISCARD_MAX; i++) {
        s->discard_passthrough[i] = r->discard_passthrough[i];
    }

    /* Only touch the timer when the interval really changed */
    if (s->cache_clean_interval != r->cache_clean_interval) {
        cache_clean_timer_del(bs);
        s->cache_clean_interval = r->cache_clean_interval;
        cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
    }

    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
    s->crypto_opts = r->crypto_opts;
}
/*
 * Parse the runtime options in @options and apply them to @bs in one step.
 *
 * Runs qcow2_update_options_prepare() on a local staging structure and then
 * commits it on success or aborts it on failure.
 *
 * Returns the (non-negative) result of the prepare step on success and a
 * negative errno value, with @errp set, on failure.
 */
static int qcow2_update_options(BlockDriverState *bs, QDict *options,
                                int flags, Error **errp)
{
    Qcow2ReopenState r = {};
    int ret;

    ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);
    if (ret >= 0) {
        qcow2_update_options_commit(bs, &r);
    } else {
        qcow2_update_options_abort(bs, &r);
    }

    return ret;
}

/*
 * Open a qcow2 image: read and validate the header from bs->file, set up
 * the driver state (cluster geometry, refcount parameters, L1 table, caches,
 * encryption, external data file, backing file name, snapshots, persistent
 * dirty bitmaps) and, if the image is marked dirty and writable, repair it.
 *
 * @bs:      node being opened; bs->file must already be attached
 * @options: driver options; runtime options and "data-file" are consumed
 * @flags:   BDRV_O_* open flags
 * @errp:    set on failure
 *
 * Returns a negative errno value on failure, after releasing everything that
 * was set up so far (see the fail label). On success the value left in ret
 * by the last step is returned; it is non-negative.
 *
 * Called with s->lock held.
 */
static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
                                      int flags, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int len, i;
    int ret = 0;
    QCowHeader header;
    Error *local_err = NULL;
    uint64_t ext_end;
    uint64_t l1_vm_state_index;
    bool update_header = false;

    ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not read qcow2 header");
        goto fail;
    }
    /* All header fields are stored big-endian on disk */
    header.magic = be32_to_cpu(header.magic);
    header.version = be32_to_cpu(header.version);
    header.backing_file_offset = be64_to_cpu(header.backing_file_offset);
    header.backing_file_size = be32_to_cpu(header.backing_file_size);
    header.size = be64_to_cpu(header.size);
    header.cluster_bits = be32_to_cpu(header.cluster_bits);
    header.crypt_method = be32_to_cpu(header.crypt_method);
    header.l1_table_offset = be64_to_cpu(header.l1_table_offset);
    header.l1_size = be32_to_cpu(header.l1_size);
    header.refcount_table_offset = be64_to_cpu(header.refcount_table_offset);
    header.refcount_table_clusters =
        be32_to_cpu(header.refcount_table_clusters);
    header.snapshots_offset = be64_to_cpu(header.snapshots_offset);
    header.nb_snapshots = be32_to_cpu(header.nb_snapshots);

    if (header.magic != QCOW_MAGIC) {
        error_setg(errp, "Image is not in qcow2 format");
        ret = -EINVAL;
        goto fail;
    }
    if (header.version < 2 || header.version > 3) {
        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
        ret = -ENOTSUP;
        goto fail;
    }

    s->qcow_version = header.version;

    /* Initialise cluster size */
    if (header.cluster_bits < MIN_CLUSTER_BITS ||
        header.cluster_bits > MAX_CLUSTER_BITS) {
        error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
                   header.cluster_bits);
        ret = -EINVAL;
        goto fail;
    }

    s->cluster_bits = header.cluster_bits;
    s->cluster_size = 1 << s->cluster_bits;

    /* Initialise version 3 header fields */
    if (header.version == 2) {
        /* Version 2 images lack these fields; substitute fixed defaults */
        header.incompatible_features    = 0;
        header.compatible_features      = 0;
        header.autoclear_features       = 0;
        header.refcount_order           = 4;
        header.header_length            = 72;
    } else {
        header.incompatible_features =
            be64_to_cpu(header.incompatible_features);
        header.compatible_features = be64_to_cpu(header.compatible_features);
        header.autoclear_features = be64_to_cpu(header.autoclear_features);
        header.refcount_order = be32_to_cpu(header.refcount_order);
        header.header_length = be32_to_cpu(header.header_length);

        if (header.header_length < 104) {
            error_setg(errp, "qcow2 header too short");
            ret = -EINVAL;
            goto fail;
        }
    }

    if (header.header_length > s->cluster_size) {
        error_setg(errp, "qcow2 header exceeds cluster size");
        ret = -EINVAL;
        goto fail;
    }

    /* Keep header bytes beyond the fields known to this implementation */
    if (header.header_length > sizeof(header)) {
        s->unknown_header_fields_size = header.header_length - sizeof(header);
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
        ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
                         s->unknown_header_fields_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
                             "fields");
            goto fail;
        }
    }

    if (header.backing_file_offset > s->cluster_size) {
        error_setg(errp, "Invalid backing file offset");
        ret = -EINVAL;
        goto fail;
    }

    /* Header extensions end where the backing file name (if any) starts */
    if (header.backing_file_offset) {
        ext_end = header.backing_file_offset;
    } else {
        ext_end = 1 << header.cluster_bits;
    }

    /* Handle feature bits */
    s->incompatible_features    = header.incompatible_features;
    s->compatible_features      = header.compatible_features;
    s->autoclear_features       = header.autoclear_features;

    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
        /*
         * Read the extensions only to obtain the feature name table for
         * the error message; the open is refused in any case.
         */
        void *feature_table = NULL;
        qcow2_read_extensions(bs, header.header_length, ext_end,
                              &feature_table, flags, NULL, NULL);
        report_unsupported_feature(errp, feature_table,
                                   s->incompatible_features &
                                   ~QCOW2_INCOMPAT_MASK);
        ret = -ENOTSUP;
        g_free(feature_table);
        goto fail;
    }

    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
        /* Corrupt images may not be written to unless they are being repaired
         */
        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
                       "read/write");
            ret = -EACCES;
            goto fail;
        }
    }

    /* Check support for various header values */
    if (header.refcount_order > 6) {
        error_setg(errp, "Reference count entry width too large; may not "
                   "exceed 64 bits");
        ret = -EINVAL;
        goto fail;
    }
    s->refcount_order = header.refcount_order;
    s->refcount_bits = 1 << s->refcount_order;
    /* Computed in two steps so that a 64 bit width does not overflow */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->crypt_method_header = header.crypt_method;
    if (s->crypt_method_header) {
        if (bdrv_uses_whitelist() &&
            s->crypt_method_header == QCOW_CRYPT_AES) {
            error_setg(errp,
                       "Use of AES-CBC encrypted qcow2 images is no longer "
                       "supported in system emulators");
            error_append_hint(errp,
                              "You can use 'qemu-img convert' to convert your "
                              "image to an alternative supported format, such "
                              "as unencrypted qcow2, or raw with the LUKS "
                              "format instead.\n");
            ret = -ENOSYS;
            goto fail;
        }

        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            s->crypt_physical_offset = false;
        } else {
            /* Assuming LUKS and any future crypt methods we
             * add will all use physical offsets, due to the
             * fact that the alternative is insecure...  */
            s->crypt_physical_offset = true;
        }

        bs->encrypted = true;
    }

    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
    s->l2_size = 1 << s->l2_bits;
    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;
    bs->total_sectors = header.size / BDRV_SECTOR_SIZE;
    s->csize_shift = (62 - (s->cluster_bits - 8));
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;

    s->refcount_table_offset = header.refcount_table_offset;
    s->refcount_table_size =
        header.refcount_table_clusters << (s->cluster_bits - 3);

    if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
        error_setg(errp, "Image does not contain a reference count table");
        ret = -EINVAL;
        goto fail;
    }

    ret = qcow2_validate_table(bs, s->refcount_table_offset,
                               header.refcount_table_clusters,
                               s->cluster_size, QCOW_MAX_REFTABLE_SIZE,
                               "Reference count table", errp);
    if (ret < 0) {
        goto fail;
    }

    if (!(flags & BDRV_O_CHECK)) {
        /*
         * The total size in bytes of the snapshot table is checked in
         * qcow2_read_snapshots() because the size of each snapshot is
         * variable and we don't know it yet.
         * Here we only check the offset and number of snapshots.
         */
        ret = qcow2_validate_table(bs, header.snapshots_offset,
                                   header.nb_snapshots,
                                   sizeof(QCowSnapshotHeader),
                                   sizeof(QCowSnapshotHeader) *
                                   QCOW_MAX_SNAPSHOTS,
                                   "Snapshot table", errp);
        if (ret < 0) {
            goto fail;
        }
    }

    /* read the level 1 table */
    ret = qcow2_validate_table(bs, header.l1_table_offset,
                               header.l1_size, sizeof(uint64_t),
                               QCOW_MAX_L1_SIZE, "Active L1 table", errp);
    if (ret < 0) {
        goto fail;
    }
    s->l1_size = header.l1_size;
    s->l1_table_offset = header.l1_table_offset;

    l1_vm_state_index = size_to_l1(s, header.size);
    if (l1_vm_state_index > INT_MAX) {
        error_setg(errp, "Image is too big");
        ret = -EFBIG;
        goto fail;
    }
    s->l1_vm_state_index = l1_vm_state_index;

    /* the L1 table must contain at least enough entries to put
       header.size bytes */
    if (s->l1_size < s->l1_vm_state_index) {
        error_setg(errp, "L1 table is too small");
        ret = -EINVAL;
        goto fail;
    }

    if (s->l1_size > 0) {
        s->l1_table = qemu_try_blockalign(bs->file->bs,
            ROUND_UP(s->l1_size * sizeof(uint64_t), 512));
        if (s->l1_table == NULL) {
            error_setg(errp, "Could not allocate L1 table");
            ret = -ENOMEM;
            goto fail;
        }
        ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
                         s->l1_size * sizeof(uint64_t));
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read L1 table");
            goto fail;
        }
        /* Convert the table to host byte order in place */
        for(i = 0;i < s->l1_size; i++) {
            s->l1_table[i] = be64_to_cpu(s->l1_table[i]);
        }
    }

    /* Parse driver-specific options */
    ret = qcow2_update_options(bs, options, flags, errp);
    if (ret < 0) {
        goto fail;
    }

    s->flags = flags;

    ret = qcow2_refcount_init(bs);
    if (ret != 0) {
        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
        goto fail;
    }

    QLIST_INIT(&s->cluster_allocs);
    QTAILQ_INIT(&s->discards);

    /* read qcow2 extensions */
    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
                              flags, &update_header, &local_err)) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    /* Open external data file */
    s->data_file = bdrv_open_child(NULL, options, "data-file", bs, &child_file,
                                   true, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
        /* Fall back to the file name stored in the image, if there is one */
        if (!s->data_file && s->image_data_file) {
            s->data_file = bdrv_open_child(s->image_data_file, options,
                                           "data-file", bs, &child_file,
                                           false, errp);
            if (!s->data_file) {
                ret = -EINVAL;
                goto fail;
            }
        }
        if (!s->data_file) {
            error_setg(errp, "'data-file' is required for this image");
            ret = -EINVAL;
            goto fail;
        }
    } else {
        if (s->data_file) {
            error_setg(errp, "'data-file' can only be set for images with an "
                             "external data file");
            ret = -EINVAL;
            goto fail;
        }

        /* Without an external data file, guest data lives in the image */
        s->data_file = bs->file;

        if (data_file_is_raw(bs)) {
            error_setg(errp, "data-file-raw requires a data file");
            ret = -EINVAL;
            goto fail;
        }
    }

    /* qcow2_read_extension may have set up the crypto context
     * if the crypt method needs a header region, some methods
     * don't need header extensions, so must check here
     */
    if (s->crypt_method_header && !s->crypto) {
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
            unsigned int cflags = 0;
            if (flags & BDRV_O_NO_IO) {
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
            }
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
                                           NULL, NULL, cflags,
                                           QCOW2_MAX_THREADS, errp);
            if (!s->crypto) {
                ret = -EINVAL;
                goto fail;
            }
        } else if (!(flags & BDRV_O_NO_IO)) {
            error_setg(errp, "Missing CRYPTO header for crypt method %d",
                       s->crypt_method_header);
            ret = -EINVAL;
            goto fail;
        }
    }

    /* read the backing file name */
    if (header.backing_file_offset != 0) {
        len = header.backing_file_size;
        if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
            len >= sizeof(bs->backing_file)) {
            error_setg(errp, "Backing file name too long");
            ret = -EINVAL;
            goto fail;
        }
        ret = bdrv_pread(bs->file, header.backing_file_offset,
                         bs->auto_backing_file, len);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not read backing file name");
            goto fail;
        }
        bs->auto_backing_file[len] = '\0';
        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
                bs->auto_backing_file);
        s->image_backing_file = g_strdup(bs->auto_backing_file);
    }

    /*
     * Internal snapshots; skip reading them in check mode, because
     * we do not need them then, and we do not want to abort because
     * of a broken table.
     */
    if (!(flags & BDRV_O_CHECK)) {
        s->snapshots_offset = header.snapshots_offset;
        s->nb_snapshots = header.nb_snapshots;

        ret = qcow2_read_snapshots(bs, errp);
        if (ret < 0) {
            goto fail;
        }
    }

    /* Clear unknown autoclear feature bits */
    update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
    /* The header can only be rewritten on a writable, active image */
    update_header =
        update_header && !bs->read_only && !(flags & BDRV_O_INACTIVE);
    if (update_header) {
        s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
    }

    /* == Handle persistent dirty bitmaps ==
     *
     * We want to load dirty bitmaps in three cases:
     *
     * 1. Normal open of the disk in active mode, not related to invalidation
     *    after migration.
     *
     * 2. Invalidation of the target vm after pre-copy phase of migration, if
     *    bitmaps are _not_ migrating through migration channel, i.e.
     *    'dirty-bitmaps' capability is disabled.
     *
     * 3. Invalidation of source vm after failed or canceled migration.
     *    This is a very interesting case. There are two possible types of
     *    bitmaps:
     *
     *    A. Stored on inactivation and removed. They should be loaded from the
     *       image.
     *
     *    B. Not stored: not-persistent bitmaps and bitmaps, migrated through
     *       the migration channel (with dirty-bitmaps capability).
     *
     *    On the other hand, there are two possible sub-cases:
     *
     *    3.1 disk was changed by somebody else while we were inactive. In
     *        this case all in-RAM dirty bitmaps (both persistent and not) are
     *        definitely invalid. And we don't have any method to determine
     *        this.
     *
     *        Simple and safe thing is to just drop all the bitmaps of type B on
     *        inactivation. But in this case we lose bitmaps in valid 3.2 case.
     *
     *        On the other hand, resuming source vm, if disk was already changed
     *        is a bad thing anyway: not only bitmaps, the whole vm state is
     *        out of sync with disk.
     *
     *        This means, that user or management tool, who for some reason
     *        decided to resume source vm, after disk was already changed by
     *        target vm, should at least drop all dirty bitmaps by hand.
     *
     *        So, we can ignore this case for now, but TODO: "generation"
     *        extension for qcow2, to determine, that image was changed after
     *        last inactivation. And if it is changed, we will drop (or at least
     *        mark as 'invalid' all the bitmaps of type B, both persistent
     *        and not).
     *
     *    3.2 disk was _not_ changed while we were inactive. Bitmaps may be
     *        saved to disk ('dirty-bitmaps' capability disabled), or not saved
     *        ('dirty-bitmaps' capability enabled), but we don't need to care
     *        of: let's load bitmaps as always: stored bitmaps will be loaded,
     *        and not stored has flag IN_USE=1 in the image and will be skipped
     *        on loading.
     *
     * One remaining possible case when we don't want to load bitmaps:
     *
     * 4. Open disk in inactive mode in target vm (bitmaps are migrating or
     *    will be loaded on invalidation, no need to try loading them before)
     */

    if (!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)) {
        /* It's case 1, 2 or 3.2. Or 3.1 which is BUG in management layer. */
        bool header_updated = qcow2_load_dirty_bitmaps(bs, &local_err);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }

        /* No need to write the header again if bitmap loading already did */
        update_header = update_header && !header_updated;
    }

    if (update_header) {
        ret = qcow2_update_header(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Could not update qcow2 header");
            goto fail;
        }
    }

    bs->supported_zero_flags = header.version >= 3 ?
                               BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK : 0;

    /* Repair image if dirty */
    if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
        BdrvCheckResult result = {0};

        ret = qcow2_co_check_locked(bs, &result,
                                    BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
        if (ret < 0 || result.check_errors) {
            /* A check that ran but left errors behind is reported as -EIO */
            if (ret >= 0) {
                ret = -EIO;
            }
            error_setg_errno(errp, -ret, "Could not repair dirty image");
            goto fail;
        }
    }

#ifdef DEBUG_ALLOC
    {
        BdrvCheckResult result = {0};
        qcow2_check_refcounts(bs, &result, 0);
    }
#endif

    qemu_co_queue_init(&s->thread_task_queue);

    return ret;

 fail:
    /* Undo everything set up above, whichever step the failure came from */
    g_free(s->image_data_file);
    if (has_data_file(bs)) {
        bdrv_unref_child(bs, s->data_file);
    }
    g_free(s->unknown_header_fields);
    cleanup_unknown_header_ext(bs);
    qcow2_free_snapshots(bs);
    qcow2_refcount_close(bs);
    qemu_vfree(s->l1_table);
    /* else pre-write overlap checks in cache_destroy may crash */
    s->l1_table = NULL;
    cache_clean_timer_del(bs);
    if (s->l2_table_cache) {
        qcow2_cache_destroy(s->l2_table_cache);
    }
    if (s->refcount_block_cache) {
        qcow2_cache_destroy(s->refcount_block_cache);
    }
    qcrypto_block_free(s->crypto);
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
    return ret;
}

/*
 * Argument/result bundle for running qcow2_do_open() through
 * qcow2_open_entry(): the first four members are the arguments of
 * qcow2_do_open(), ret receives its return value.
 */
typedef struct QCow2OpenCo {
    BlockDriverState *bs;
    QDict *options;
    int flags;
    Error **errp;
    int ret;
} QCow2OpenCo;

/*
 * Coroutine entry point: takes s->lock, runs qcow2_do_open() with the
 * arguments stored in the QCow2OpenCo passed as @opaque, stores the result
 * in its ret member and releases the lock again.
 */
static void coroutine_fn qcow2_open_entry(void *opaque)
{
    QCow2OpenCo *qoc = opaque;
    BDRVQcow2State *s = qoc->bs->opaque;

    qemu_co_mutex_lock(&s->lock);
    qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
    qemu_co_mutex_unlock(&s->lock);
}
*/ 1820 qcow2_open_entry(&qoc); 1821 } else { 1822 assert(qemu_get_current_aio_context() == qemu_get_aio_context()); 1823 qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc)); 1824 BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS); 1825 } 1826 return qoc.ret; 1827 } 1828 1829 static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) 1830 { 1831 BDRVQcow2State *s = bs->opaque; 1832 1833 if (bs->encrypted) { 1834 /* Encryption works on a sector granularity */ 1835 bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto); 1836 } 1837 bs->bl.pwrite_zeroes_alignment = s->cluster_size; 1838 bs->bl.pdiscard_alignment = s->cluster_size; 1839 } 1840 1841 static int qcow2_reopen_prepare(BDRVReopenState *state, 1842 BlockReopenQueue *queue, Error **errp) 1843 { 1844 Qcow2ReopenState *r; 1845 int ret; 1846 1847 r = g_new0(Qcow2ReopenState, 1); 1848 state->opaque = r; 1849 1850 ret = qcow2_update_options_prepare(state->bs, r, state->options, 1851 state->flags, errp); 1852 if (ret < 0) { 1853 goto fail; 1854 } 1855 1856 /* We need to write out any unwritten data if we reopen read-only. */ 1857 if ((state->flags & BDRV_O_RDWR) == 0) { 1858 ret = qcow2_reopen_bitmaps_ro(state->bs, errp); 1859 if (ret < 0) { 1860 goto fail; 1861 } 1862 1863 ret = bdrv_flush(state->bs); 1864 if (ret < 0) { 1865 goto fail; 1866 } 1867 1868 ret = qcow2_mark_clean(state->bs); 1869 if (ret < 0) { 1870 goto fail; 1871 } 1872 } 1873 1874 return 0; 1875 1876 fail: 1877 qcow2_update_options_abort(state->bs, r); 1878 g_free(r); 1879 return ret; 1880 } 1881 1882 static void qcow2_reopen_commit(BDRVReopenState *state) 1883 { 1884 qcow2_update_options_commit(state->bs, state->opaque); 1885 if (state->flags & BDRV_O_RDWR) { 1886 Error *local_err = NULL; 1887 1888 if (qcow2_reopen_bitmaps_rw(state->bs, &local_err) < 0) { 1889 /* 1890 * This is not fatal, bitmaps just left read-only, so all following 1891 * writes will fail. 
User can remove read-only bitmaps to unblock 1892 * writes or retry reopen. 1893 */ 1894 error_reportf_err(local_err, 1895 "%s: Failed to make dirty bitmaps writable: ", 1896 bdrv_get_node_name(state->bs)); 1897 } 1898 } 1899 g_free(state->opaque); 1900 } 1901 1902 static void qcow2_reopen_abort(BDRVReopenState *state) 1903 { 1904 qcow2_update_options_abort(state->bs, state->opaque); 1905 g_free(state->opaque); 1906 } 1907 1908 static void qcow2_join_options(QDict *options, QDict *old_options) 1909 { 1910 bool has_new_overlap_template = 1911 qdict_haskey(options, QCOW2_OPT_OVERLAP) || 1912 qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE); 1913 bool has_new_total_cache_size = 1914 qdict_haskey(options, QCOW2_OPT_CACHE_SIZE); 1915 bool has_all_cache_options; 1916 1917 /* New overlap template overrides all old overlap options */ 1918 if (has_new_overlap_template) { 1919 qdict_del(old_options, QCOW2_OPT_OVERLAP); 1920 qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE); 1921 qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER); 1922 qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1); 1923 qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2); 1924 qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE); 1925 qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK); 1926 qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE); 1927 qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1); 1928 qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2); 1929 } 1930 1931 /* New total cache size overrides all old options */ 1932 if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) { 1933 qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE); 1934 qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE); 1935 } 1936 1937 qdict_join(options, old_options, false); 1938 1939 /* 1940 * If after merging all cache size options are set, an old total size is 1941 * overwritten. Do keep all options, however, if all three are new. The 1942 * resulting error message is what we want to happen. 
1943 */ 1944 has_all_cache_options = 1945 qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) || 1946 qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) || 1947 qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE); 1948 1949 if (has_all_cache_options && !has_new_total_cache_size) { 1950 qdict_del(options, QCOW2_OPT_CACHE_SIZE); 1951 } 1952 } 1953 1954 static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs, 1955 bool want_zero, 1956 int64_t offset, int64_t count, 1957 int64_t *pnum, int64_t *map, 1958 BlockDriverState **file) 1959 { 1960 BDRVQcow2State *s = bs->opaque; 1961 uint64_t cluster_offset; 1962 unsigned int bytes; 1963 int ret, status = 0; 1964 1965 qemu_co_mutex_lock(&s->lock); 1966 1967 if (!s->metadata_preallocation_checked) { 1968 ret = qcow2_detect_metadata_preallocation(bs); 1969 s->metadata_preallocation = (ret == 1); 1970 s->metadata_preallocation_checked = true; 1971 } 1972 1973 bytes = MIN(INT_MAX, count); 1974 ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset); 1975 qemu_co_mutex_unlock(&s->lock); 1976 if (ret < 0) { 1977 return ret; 1978 } 1979 1980 *pnum = bytes; 1981 1982 if ((ret == QCOW2_CLUSTER_NORMAL || ret == QCOW2_CLUSTER_ZERO_ALLOC) && 1983 !s->crypto) { 1984 *map = cluster_offset | offset_into_cluster(s, offset); 1985 *file = s->data_file->bs; 1986 status |= BDRV_BLOCK_OFFSET_VALID; 1987 } 1988 if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) { 1989 status |= BDRV_BLOCK_ZERO; 1990 } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { 1991 status |= BDRV_BLOCK_DATA; 1992 } 1993 if (s->metadata_preallocation && (status & BDRV_BLOCK_DATA) && 1994 (status & BDRV_BLOCK_OFFSET_VALID)) 1995 { 1996 status |= BDRV_BLOCK_RECURSE; 1997 } 1998 return status; 1999 } 2000 2001 static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs, 2002 QCowL2Meta **pl2meta, 2003 bool link_l2) 2004 { 2005 int ret = 0; 2006 QCowL2Meta *l2meta = *pl2meta; 2007 2008 while (l2meta != NULL) { 2009 QCowL2Meta *next; 2010 2011 if 
(link_l2) { 2012 ret = qcow2_alloc_cluster_link_l2(bs, l2meta); 2013 if (ret) { 2014 goto out; 2015 } 2016 } else { 2017 qcow2_alloc_cluster_abort(bs, l2meta); 2018 } 2019 2020 /* Take the request off the list of running requests */ 2021 if (l2meta->nb_clusters != 0) { 2022 QLIST_REMOVE(l2meta, next_in_flight); 2023 } 2024 2025 qemu_co_queue_restart_all(&l2meta->dependent_requests); 2026 2027 next = l2meta->next; 2028 g_free(l2meta); 2029 l2meta = next; 2030 } 2031 out: 2032 *pl2meta = l2meta; 2033 return ret; 2034 } 2035 2036 static coroutine_fn int 2037 qcow2_co_preadv_encrypted(BlockDriverState *bs, 2038 uint64_t file_cluster_offset, 2039 uint64_t offset, 2040 uint64_t bytes, 2041 QEMUIOVector *qiov, 2042 uint64_t qiov_offset) 2043 { 2044 int ret; 2045 BDRVQcow2State *s = bs->opaque; 2046 uint8_t *buf; 2047 2048 assert(bs->encrypted && s->crypto); 2049 assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); 2050 2051 /* 2052 * For encrypted images, read everything into a temporary 2053 * contiguous buffer on which the AES functions can work. 2054 * Also, decryption in a separate buffer is better as it 2055 * prevents the guest from learning information about the 2056 * encrypted nature of the virtual disk. 
2057 */ 2058 2059 buf = qemu_try_blockalign(s->data_file->bs, bytes); 2060 if (buf == NULL) { 2061 return -ENOMEM; 2062 } 2063 2064 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); 2065 ret = bdrv_co_pread(s->data_file, 2066 file_cluster_offset + offset_into_cluster(s, offset), 2067 bytes, buf, 0); 2068 if (ret < 0) { 2069 goto fail; 2070 } 2071 2072 assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); 2073 assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); 2074 if (qcow2_co_decrypt(bs, 2075 file_cluster_offset + offset_into_cluster(s, offset), 2076 offset, buf, bytes) < 0) 2077 { 2078 ret = -EIO; 2079 goto fail; 2080 } 2081 qemu_iovec_from_buf(qiov, qiov_offset, buf, bytes); 2082 2083 fail: 2084 qemu_vfree(buf); 2085 2086 return ret; 2087 } 2088 2089 typedef struct Qcow2AioTask { 2090 AioTask task; 2091 2092 BlockDriverState *bs; 2093 QCow2ClusterType cluster_type; /* only for read */ 2094 uint64_t file_cluster_offset; 2095 uint64_t offset; 2096 uint64_t bytes; 2097 QEMUIOVector *qiov; 2098 uint64_t qiov_offset; 2099 QCowL2Meta *l2meta; /* only for write */ 2100 } Qcow2AioTask; 2101 2102 static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task); 2103 static coroutine_fn int qcow2_add_task(BlockDriverState *bs, 2104 AioTaskPool *pool, 2105 AioTaskFunc func, 2106 QCow2ClusterType cluster_type, 2107 uint64_t file_cluster_offset, 2108 uint64_t offset, 2109 uint64_t bytes, 2110 QEMUIOVector *qiov, 2111 size_t qiov_offset, 2112 QCowL2Meta *l2meta) 2113 { 2114 Qcow2AioTask local_task; 2115 Qcow2AioTask *task = pool ? g_new(Qcow2AioTask, 1) : &local_task; 2116 2117 *task = (Qcow2AioTask) { 2118 .task.func = func, 2119 .bs = bs, 2120 .cluster_type = cluster_type, 2121 .qiov = qiov, 2122 .file_cluster_offset = file_cluster_offset, 2123 .offset = offset, 2124 .bytes = bytes, 2125 .qiov_offset = qiov_offset, 2126 .l2meta = l2meta, 2127 }; 2128 2129 trace_qcow2_add_task(qemu_coroutine_self(), bs, pool, 2130 func == qcow2_co_preadv_task_entry ? 
"read" : "write", 2131 cluster_type, file_cluster_offset, offset, bytes, 2132 qiov, qiov_offset); 2133 2134 if (!pool) { 2135 return func(&task->task); 2136 } 2137 2138 aio_task_pool_start_task(pool, &task->task); 2139 2140 return 0; 2141 } 2142 2143 static coroutine_fn int qcow2_co_preadv_task(BlockDriverState *bs, 2144 QCow2ClusterType cluster_type, 2145 uint64_t file_cluster_offset, 2146 uint64_t offset, uint64_t bytes, 2147 QEMUIOVector *qiov, 2148 size_t qiov_offset) 2149 { 2150 BDRVQcow2State *s = bs->opaque; 2151 int offset_in_cluster = offset_into_cluster(s, offset); 2152 2153 switch (cluster_type) { 2154 case QCOW2_CLUSTER_ZERO_PLAIN: 2155 case QCOW2_CLUSTER_ZERO_ALLOC: 2156 /* Both zero types are handled in qcow2_co_preadv_part */ 2157 g_assert_not_reached(); 2158 2159 case QCOW2_CLUSTER_UNALLOCATED: 2160 assert(bs->backing); /* otherwise handled in qcow2_co_preadv_part */ 2161 2162 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); 2163 return bdrv_co_preadv_part(bs->backing, offset, bytes, 2164 qiov, qiov_offset, 0); 2165 2166 case QCOW2_CLUSTER_COMPRESSED: 2167 return qcow2_co_preadv_compressed(bs, file_cluster_offset, 2168 offset, bytes, qiov, qiov_offset); 2169 2170 case QCOW2_CLUSTER_NORMAL: 2171 if ((file_cluster_offset & 511) != 0) { 2172 return -EIO; 2173 } 2174 2175 if (bs->encrypted) { 2176 return qcow2_co_preadv_encrypted(bs, file_cluster_offset, 2177 offset, bytes, qiov, qiov_offset); 2178 } 2179 2180 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); 2181 return bdrv_co_preadv_part(s->data_file, 2182 file_cluster_offset + offset_in_cluster, 2183 bytes, qiov, qiov_offset, 0); 2184 2185 default: 2186 g_assert_not_reached(); 2187 } 2188 2189 g_assert_not_reached(); 2190 } 2191 2192 static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task) 2193 { 2194 Qcow2AioTask *t = container_of(task, Qcow2AioTask, task); 2195 2196 assert(!t->l2meta); 2197 2198 return qcow2_co_preadv_task(t->bs, t->cluster_type, t->file_cluster_offset, 2199 t->offset, t->bytes, 
t->qiov, t->qiov_offset); 2200 } 2201 2202 static coroutine_fn int qcow2_co_preadv_part(BlockDriverState *bs, 2203 uint64_t offset, uint64_t bytes, 2204 QEMUIOVector *qiov, 2205 size_t qiov_offset, int flags) 2206 { 2207 BDRVQcow2State *s = bs->opaque; 2208 int ret = 0; 2209 unsigned int cur_bytes; /* number of bytes in current iteration */ 2210 uint64_t cluster_offset = 0; 2211 AioTaskPool *aio = NULL; 2212 2213 while (bytes != 0 && aio_task_pool_status(aio) == 0) { 2214 /* prepare next request */ 2215 cur_bytes = MIN(bytes, INT_MAX); 2216 if (s->crypto) { 2217 cur_bytes = MIN(cur_bytes, 2218 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); 2219 } 2220 2221 qemu_co_mutex_lock(&s->lock); 2222 ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset); 2223 qemu_co_mutex_unlock(&s->lock); 2224 if (ret < 0) { 2225 goto out; 2226 } 2227 2228 if (ret == QCOW2_CLUSTER_ZERO_PLAIN || 2229 ret == QCOW2_CLUSTER_ZERO_ALLOC || 2230 (ret == QCOW2_CLUSTER_UNALLOCATED && !bs->backing)) 2231 { 2232 qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes); 2233 } else { 2234 if (!aio && cur_bytes != bytes) { 2235 aio = aio_task_pool_new(QCOW2_MAX_WORKERS); 2236 } 2237 ret = qcow2_add_task(bs, aio, qcow2_co_preadv_task_entry, ret, 2238 cluster_offset, offset, cur_bytes, 2239 qiov, qiov_offset, NULL); 2240 if (ret < 0) { 2241 goto out; 2242 } 2243 } 2244 2245 bytes -= cur_bytes; 2246 offset += cur_bytes; 2247 qiov_offset += cur_bytes; 2248 } 2249 2250 out: 2251 if (aio) { 2252 aio_task_pool_wait_all(aio); 2253 if (ret == 0) { 2254 ret = aio_task_pool_status(aio); 2255 } 2256 g_free(aio); 2257 } 2258 2259 return ret; 2260 } 2261 2262 /* Check if it's possible to merge a write request with the writing of 2263 * the data from the COW regions */ 2264 static bool merge_cow(uint64_t offset, unsigned bytes, 2265 QEMUIOVector *qiov, size_t qiov_offset, 2266 QCowL2Meta *l2meta) 2267 { 2268 QCowL2Meta *m; 2269 2270 for (m = l2meta; m != NULL; m = m->next) { 2271 /* If both COW regions are 
empty then there's nothing to merge */ 2272 if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) { 2273 continue; 2274 } 2275 2276 /* If COW regions are handled already, skip this too */ 2277 if (m->skip_cow) { 2278 continue; 2279 } 2280 2281 /* The data (middle) region must be immediately after the 2282 * start region */ 2283 if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) { 2284 continue; 2285 } 2286 2287 /* The end region must be immediately after the data (middle) 2288 * region */ 2289 if (m->offset + m->cow_end.offset != offset + bytes) { 2290 continue; 2291 } 2292 2293 /* Make sure that adding both COW regions to the QEMUIOVector 2294 * does not exceed IOV_MAX */ 2295 if (qemu_iovec_subvec_niov(qiov, qiov_offset, bytes) > IOV_MAX - 2) { 2296 continue; 2297 } 2298 2299 m->data_qiov = qiov; 2300 m->data_qiov_offset = qiov_offset; 2301 return true; 2302 } 2303 2304 return false; 2305 } 2306 2307 static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes) 2308 { 2309 int64_t nr; 2310 return !bytes || 2311 (!bdrv_is_allocated_above(bs, NULL, false, offset, bytes, &nr) && 2312 nr == bytes); 2313 } 2314 2315 static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m) 2316 { 2317 /* 2318 * This check is designed for optimization shortcut so it must be 2319 * efficient. 2320 * Instead of is_zero(), use is_unallocated() as it is faster (but not 2321 * as accurate and can result in false negatives). 
2322 */ 2323 return is_unallocated(bs, m->offset + m->cow_start.offset, 2324 m->cow_start.nb_bytes) && 2325 is_unallocated(bs, m->offset + m->cow_end.offset, 2326 m->cow_end.nb_bytes); 2327 } 2328 2329 static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta) 2330 { 2331 BDRVQcow2State *s = bs->opaque; 2332 QCowL2Meta *m; 2333 2334 if (!(s->data_file->bs->supported_zero_flags & BDRV_REQ_NO_FALLBACK)) { 2335 return 0; 2336 } 2337 2338 if (bs->encrypted) { 2339 return 0; 2340 } 2341 2342 for (m = l2meta; m != NULL; m = m->next) { 2343 int ret; 2344 2345 if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) { 2346 continue; 2347 } 2348 2349 if (!is_zero_cow(bs, m)) { 2350 continue; 2351 } 2352 2353 /* 2354 * instead of writing zero COW buffers, 2355 * efficiently zero out the whole clusters 2356 */ 2357 2358 ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset, 2359 m->nb_clusters * s->cluster_size, 2360 true); 2361 if (ret < 0) { 2362 return ret; 2363 } 2364 2365 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE); 2366 ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset, 2367 m->nb_clusters * s->cluster_size, 2368 BDRV_REQ_NO_FALLBACK); 2369 if (ret < 0) { 2370 if (ret != -ENOTSUP && ret != -EAGAIN) { 2371 return ret; 2372 } 2373 continue; 2374 } 2375 2376 trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters); 2377 m->skip_cow = true; 2378 } 2379 return 0; 2380 } 2381 2382 /* 2383 * qcow2_co_pwritev_task 2384 * Called with s->lock unlocked 2385 * l2meta - if not NULL, qcow2_co_pwritev_task() will consume it. 
/*
 * qcow2_co_pwritev_task
 *
 * Write @bytes bytes taken from @qiov (starting at @qiov_offset) to the host
 * location @file_cluster_offset plus the in-cluster part of @offset, then
 * link the new clusters described by @l2meta into the L2 tables.
 *
 * Called with s->lock unlocked.  The function takes s->lock itself around
 * the l2meta handling and releases it before returning.
 *
 * l2meta - if not NULL, qcow2_co_pwritev_task() will consume it. The caller
 *          must not use it in any way after the call.
 *
 * Returns the result of qcow2_handle_l2meta() on the success path, otherwise
 * -ENOMEM, -EIO or the negative value of the step that failed.
 */
static coroutine_fn int qcow2_co_pwritev_task(BlockDriverState *bs,
                                              uint64_t file_cluster_offset,
                                              uint64_t offset, uint64_t bytes,
                                              QEMUIOVector *qiov,
                                              uint64_t qiov_offset,
                                              QCowL2Meta *l2meta)
{
    int ret;
    BDRVQcow2State *s = bs->opaque;
    void *crypt_buf = NULL;
    int offset_in_cluster = offset_into_cluster(s, offset);
    QEMUIOVector encrypted_qiov;

    if (bs->encrypted) {
        assert(s->crypto);
        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
        /* Encrypt into a private bounce buffer, not the caller's vector */
        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
        if (crypt_buf == NULL) {
            ret = -ENOMEM;
            goto out_unlocked;
        }
        qemu_iovec_to_buf(qiov, qiov_offset, crypt_buf, bytes);

        if (qcow2_co_encrypt(bs, file_cluster_offset + offset_in_cluster,
                             offset, crypt_buf, bytes) < 0)
        {
            ret = -EIO;
            goto out_unlocked;
        }

        /* From here on, write the ciphertext instead of the guest data */
        qemu_iovec_init_buf(&encrypted_qiov, crypt_buf, bytes);
        qiov = &encrypted_qiov;
        qiov_offset = 0;
    }

    /* Try to efficiently initialize the physical space with zeroes */
    ret = handle_alloc_space(bs, l2meta);
    if (ret < 0) {
        goto out_unlocked;
    }

    /*
     * If we need to do COW, check if it's possible to merge the
     * writing of the guest data together with that of the COW regions.
     * If it's not possible (or not necessary) then write the
     * guest data now.
     */
    if (!merge_cow(offset, bytes, qiov, qiov_offset, l2meta)) {
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        trace_qcow2_writev_data(qemu_coroutine_self(),
                                file_cluster_offset + offset_in_cluster);
        ret = bdrv_co_pwritev_part(s->data_file,
                                   file_cluster_offset + offset_in_cluster,
                                   bytes, qiov, qiov_offset, 0);
        if (ret < 0) {
            goto out_unlocked;
        }
    }

    qemu_co_mutex_lock(&s->lock);

    /* Success path: link the entries; any entry left over is aborted below */
    ret = qcow2_handle_l2meta(bs, &l2meta, true);
    goto out_locked;

out_unlocked:
    /* Error path: the cleanup below expects s->lock to be held */
    qemu_co_mutex_lock(&s->lock);

out_locked:
    /* Abort and free whatever is still on the l2meta list */
    qcow2_handle_l2meta(bs, &l2meta, false);
    qemu_co_mutex_unlock(&s->lock);

    qemu_vfree(crypt_buf);

    return ret;
}
qemu_co_mutex_lock(&s->lock); 2503 2504 ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, 2505 &cluster_offset, &l2meta); 2506 if (ret < 0) { 2507 goto out_locked; 2508 } 2509 2510 assert((cluster_offset & 511) == 0); 2511 2512 ret = qcow2_pre_write_overlap_check(bs, 0, 2513 cluster_offset + offset_in_cluster, 2514 cur_bytes, true); 2515 if (ret < 0) { 2516 goto out_locked; 2517 } 2518 2519 qemu_co_mutex_unlock(&s->lock); 2520 2521 if (!aio && cur_bytes != bytes) { 2522 aio = aio_task_pool_new(QCOW2_MAX_WORKERS); 2523 } 2524 ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_task_entry, 0, 2525 cluster_offset, offset, cur_bytes, 2526 qiov, qiov_offset, l2meta); 2527 l2meta = NULL; /* l2meta is consumed by qcow2_co_pwritev_task() */ 2528 if (ret < 0) { 2529 goto fail_nometa; 2530 } 2531 2532 bytes -= cur_bytes; 2533 offset += cur_bytes; 2534 qiov_offset += cur_bytes; 2535 trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes); 2536 } 2537 ret = 0; 2538 2539 qemu_co_mutex_lock(&s->lock); 2540 2541 out_locked: 2542 qcow2_handle_l2meta(bs, &l2meta, false); 2543 2544 qemu_co_mutex_unlock(&s->lock); 2545 2546 fail_nometa: 2547 if (aio) { 2548 aio_task_pool_wait_all(aio); 2549 if (ret == 0) { 2550 ret = aio_task_pool_status(aio); 2551 } 2552 g_free(aio); 2553 } 2554 2555 trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); 2556 2557 return ret; 2558 } 2559 2560 static int qcow2_inactivate(BlockDriverState *bs) 2561 { 2562 BDRVQcow2State *s = bs->opaque; 2563 int ret, result = 0; 2564 Error *local_err = NULL; 2565 2566 qcow2_store_persistent_dirty_bitmaps(bs, true, &local_err); 2567 if (local_err != NULL) { 2568 result = -EINVAL; 2569 error_reportf_err(local_err, "Lost persistent bitmaps during " 2570 "inactivation of node '%s': ", 2571 bdrv_get_device_or_node_name(bs)); 2572 } 2573 2574 ret = qcow2_cache_flush(bs, s->l2_table_cache); 2575 if (ret) { 2576 result = ret; 2577 error_report("Failed to flush the L2 table cache: %s", 2578 strerror(-ret)); 2579 } 
2580 2581 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 2582 if (ret) { 2583 result = ret; 2584 error_report("Failed to flush the refcount block cache: %s", 2585 strerror(-ret)); 2586 } 2587 2588 if (result == 0) { 2589 qcow2_mark_clean(bs); 2590 } 2591 2592 return result; 2593 } 2594 2595 static void qcow2_close(BlockDriverState *bs) 2596 { 2597 BDRVQcow2State *s = bs->opaque; 2598 qemu_vfree(s->l1_table); 2599 /* else pre-write overlap checks in cache_destroy may crash */ 2600 s->l1_table = NULL; 2601 2602 if (!(s->flags & BDRV_O_INACTIVE)) { 2603 qcow2_inactivate(bs); 2604 } 2605 2606 cache_clean_timer_del(bs); 2607 qcow2_cache_destroy(s->l2_table_cache); 2608 qcow2_cache_destroy(s->refcount_block_cache); 2609 2610 qcrypto_block_free(s->crypto); 2611 s->crypto = NULL; 2612 2613 g_free(s->unknown_header_fields); 2614 cleanup_unknown_header_ext(bs); 2615 2616 g_free(s->image_data_file); 2617 g_free(s->image_backing_file); 2618 g_free(s->image_backing_format); 2619 2620 if (has_data_file(bs)) { 2621 bdrv_unref_child(bs, s->data_file); 2622 } 2623 2624 qcow2_refcount_close(bs); 2625 qcow2_free_snapshots(bs); 2626 } 2627 2628 static void coroutine_fn qcow2_co_invalidate_cache(BlockDriverState *bs, 2629 Error **errp) 2630 { 2631 BDRVQcow2State *s = bs->opaque; 2632 int flags = s->flags; 2633 QCryptoBlock *crypto = NULL; 2634 QDict *options; 2635 Error *local_err = NULL; 2636 int ret; 2637 2638 /* 2639 * Backing files are read-only which makes all of their metadata immutable, 2640 * that means we don't have to worry about reopening them here. 
2641 */ 2642 2643 crypto = s->crypto; 2644 s->crypto = NULL; 2645 2646 qcow2_close(bs); 2647 2648 memset(s, 0, sizeof(BDRVQcow2State)); 2649 options = qdict_clone_shallow(bs->options); 2650 2651 flags &= ~BDRV_O_INACTIVE; 2652 qemu_co_mutex_lock(&s->lock); 2653 ret = qcow2_do_open(bs, options, flags, &local_err); 2654 qemu_co_mutex_unlock(&s->lock); 2655 qobject_unref(options); 2656 if (local_err) { 2657 error_propagate_prepend(errp, local_err, 2658 "Could not reopen qcow2 layer: "); 2659 bs->drv = NULL; 2660 return; 2661 } else if (ret < 0) { 2662 error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); 2663 bs->drv = NULL; 2664 return; 2665 } 2666 2667 s->crypto = crypto; 2668 } 2669 2670 static size_t header_ext_add(char *buf, uint32_t magic, const void *s, 2671 size_t len, size_t buflen) 2672 { 2673 QCowExtension *ext_backing_fmt = (QCowExtension*) buf; 2674 size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); 2675 2676 if (buflen < ext_len) { 2677 return -ENOSPC; 2678 } 2679 2680 *ext_backing_fmt = (QCowExtension) { 2681 .magic = cpu_to_be32(magic), 2682 .len = cpu_to_be32(len), 2683 }; 2684 2685 if (len) { 2686 memcpy(buf + sizeof(QCowExtension), s, len); 2687 } 2688 2689 return ext_len; 2690 } 2691 2692 /* 2693 * Updates the qcow2 header, including the variable length parts of it, i.e. 2694 * the backing file name and all extensions. qcow2 was not designed to allow 2695 * such changes, so if we run out of space (we can only use the first cluster) 2696 * this function may fail. 2697 * 2698 * Returns 0 on success, -errno in error cases. 
/*
 * Updates the qcow2 header, including the variable length parts of it, i.e.
 * the backing file name and all extensions. qcow2 was not designed to allow
 * such changes, so if we run out of space (we can only use the first cluster)
 * this function may fail.
 *
 * The header is assembled in a temporary buffer of one cluster and written
 * to offset 0 of bs->file in a single request.  Layout, in order: fixed
 * header, unknown header fields, header extensions (backing format, external
 * data file, crypto header pointer, feature table, bitmaps, unknown
 * extensions, end marker) and finally the backing file name.
 *
 * Returns 0 on success, -errno in error cases.
 */
int qcow2_update_header(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    QCowHeader *header;
    char *buf;
    size_t buflen = s->cluster_size;
    int ret;
    uint64_t total_size;
    uint32_t refcount_table_clusters;
    size_t header_length;
    Qcow2UnknownHeaderExtension *uext;

    /*
     * buf is advanced as a write cursor below while buflen tracks the space
     * left; header keeps pointing at the start of the allocation.
     */
    buf = qemu_blockalign(bs, buflen);

    /* Header structure */
    header = (QCowHeader*) buf;

    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
    }

    header_length = sizeof(*header) + s->unknown_header_fields_size;
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    *header = (QCowHeader) {
        /* Version 2 fields */
        .magic                  = cpu_to_be32(QCOW_MAGIC),
        .version                = cpu_to_be32(s->qcow_version),
        /* The backing file fields are filled in at the end, if needed */
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
        .refcount_order         = cpu_to_be32(s->refcount_order),
        .header_length          = cpu_to_be32(header_length),
    };

    /* For older versions, write a shorter header */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
        ret = -EINVAL;
        goto fail;
    }

    /*
     * Skip the fixed header and clear everything behind it.  For version 2
     * this also wipes the version 3 fields set by the initializer above.
     */
    buf += ret;
    buflen -= ret;
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }

    /* Backing file format header extension */
    if (s->image_backing_format) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
                             s->image_backing_format,
                             strlen(s->image_backing_format),
                             buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* External data file header extension */
    if (has_data_file(bs) && s->image_data_file) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_DATA_FILE,
                             s->image_data_file, strlen(s->image_data_file),
                             buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* Full disk encryption header pointer extension */
    if (s->crypto_header.offset != 0) {
        /*
         * s->crypto_header is byte-swapped in place for serialization and
         * swapped back right after the copy, before the error check.
         */
        s->crypto_header.offset = cpu_to_be64(s->crypto_header.offset);
        s->crypto_header.length = cpu_to_be64(s->crypto_header.length);
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
                             &s->crypto_header, sizeof(s->crypto_header),
                             buflen);
        s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
        s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Feature table */
    if (s->qcow_version >= 3) {
        Qcow2Feature features[] = {
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
                .name = "dirty bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
                .name = "corrupt bit",
            },
            {
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
                .bit  = QCOW2_INCOMPAT_DATA_FILE_BITNR,
                .name = "external data file",
            },
            {
                .type = QCOW2_FEAT_TYPE_COMPATIBLE,
                .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
                .name = "lazy refcounts",
            },
        };

        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                             features, sizeof(features), buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Bitmap extension */
    if (s->nb_bitmaps > 0) {
        Qcow2BitmapHeaderExt bitmaps_header = {
            .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
            .bitmap_directory_size =
                    cpu_to_be64(s->bitmap_directory_size),
            .bitmap_directory_offset =
                    cpu_to_be64(s->bitmap_directory_offset)
        };
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
                             &bitmaps_header, sizeof(bitmaps_header),
                             buflen);
        if (ret < 0) {
            goto fail;
        }
        buf += ret;
        buflen -= ret;
    }

    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
    if (ret < 0) {
        goto fail;
    }

    buf += ret;
    buflen -= ret;

    /* Backing file name */
    if (s->image_backing_file) {
        size_t backing_file_len = strlen(s->image_backing_file);

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

        /* Using strncpy is ok here, since buf is not NUL-terminated. */
        strncpy(buf, s->image_backing_file, buflen);

        /* The name is stored as offset (from header start) plus length */
        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
    }

    /* Write the new header */
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    /* Free via header: buf no longer points at the start of the buffer */
    qemu_vfree(header);
    return ret;
}
g_strdup(bs->backing_format) : NULL; 2948 2949 return qcow2_update_header(bs); 2950 } 2951 2952 static int qcow2_crypt_method_from_format(const char *encryptfmt) 2953 { 2954 if (g_str_equal(encryptfmt, "luks")) { 2955 return QCOW_CRYPT_LUKS; 2956 } else if (g_str_equal(encryptfmt, "aes")) { 2957 return QCOW_CRYPT_AES; 2958 } else { 2959 return -EINVAL; 2960 } 2961 } 2962 2963 static int qcow2_set_up_encryption(BlockDriverState *bs, 2964 QCryptoBlockCreateOptions *cryptoopts, 2965 Error **errp) 2966 { 2967 BDRVQcow2State *s = bs->opaque; 2968 QCryptoBlock *crypto = NULL; 2969 int fmt, ret; 2970 2971 switch (cryptoopts->format) { 2972 case Q_CRYPTO_BLOCK_FORMAT_LUKS: 2973 fmt = QCOW_CRYPT_LUKS; 2974 break; 2975 case Q_CRYPTO_BLOCK_FORMAT_QCOW: 2976 fmt = QCOW_CRYPT_AES; 2977 break; 2978 default: 2979 error_setg(errp, "Crypto format not supported in qcow2"); 2980 return -EINVAL; 2981 } 2982 2983 s->crypt_method_header = fmt; 2984 2985 crypto = qcrypto_block_create(cryptoopts, "encrypt.", 2986 qcow2_crypto_hdr_init_func, 2987 qcow2_crypto_hdr_write_func, 2988 bs, errp); 2989 if (!crypto) { 2990 return -EINVAL; 2991 } 2992 2993 ret = qcow2_update_header(bs); 2994 if (ret < 0) { 2995 error_setg_errno(errp, -ret, "Could not write encryption header"); 2996 goto out; 2997 } 2998 2999 ret = 0; 3000 out: 3001 qcrypto_block_free(crypto); 3002 return ret; 3003 } 3004 3005 /** 3006 * Preallocates metadata structures for data clusters between @offset (in the 3007 * guest disk) and @new_length (which is thus generally the new guest disk 3008 * size). 3009 * 3010 * Returns: 0 on success, -errno on failure. 
3011 */ 3012 static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset, 3013 uint64_t new_length, PreallocMode mode, 3014 Error **errp) 3015 { 3016 BDRVQcow2State *s = bs->opaque; 3017 uint64_t bytes; 3018 uint64_t host_offset = 0; 3019 int64_t file_length; 3020 unsigned int cur_bytes; 3021 int ret; 3022 QCowL2Meta *meta; 3023 3024 assert(offset <= new_length); 3025 bytes = new_length - offset; 3026 3027 while (bytes) { 3028 cur_bytes = MIN(bytes, QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size)); 3029 ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes, 3030 &host_offset, &meta); 3031 if (ret < 0) { 3032 error_setg_errno(errp, -ret, "Allocating clusters failed"); 3033 return ret; 3034 } 3035 3036 while (meta) { 3037 QCowL2Meta *next = meta->next; 3038 3039 ret = qcow2_alloc_cluster_link_l2(bs, meta); 3040 if (ret < 0) { 3041 error_setg_errno(errp, -ret, "Mapping clusters failed"); 3042 qcow2_free_any_clusters(bs, meta->alloc_offset, 3043 meta->nb_clusters, QCOW2_DISCARD_NEVER); 3044 return ret; 3045 } 3046 3047 /* There are no dependent requests, but we need to remove our 3048 * request from the list of in-flight requests */ 3049 QLIST_REMOVE(meta, next_in_flight); 3050 3051 g_free(meta); 3052 meta = next; 3053 } 3054 3055 /* TODO Preallocate data if requested */ 3056 3057 bytes -= cur_bytes; 3058 offset += cur_bytes; 3059 } 3060 3061 /* 3062 * It is expected that the image file is large enough to actually contain 3063 * all of the allocated clusters (otherwise we get failing reads after 3064 * EOF). Extend the image to the last allocated sector. 
3065 */ 3066 file_length = bdrv_getlength(s->data_file->bs); 3067 if (file_length < 0) { 3068 error_setg_errno(errp, -file_length, "Could not get file size"); 3069 return file_length; 3070 } 3071 3072 if (host_offset + cur_bytes > file_length) { 3073 if (mode == PREALLOC_MODE_METADATA) { 3074 mode = PREALLOC_MODE_OFF; 3075 } 3076 ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false, 3077 mode, errp); 3078 if (ret < 0) { 3079 return ret; 3080 } 3081 } 3082 3083 return 0; 3084 } 3085 3086 /* qcow2_refcount_metadata_size: 3087 * @clusters: number of clusters to refcount (including data and L1/L2 tables) 3088 * @cluster_size: size of a cluster, in bytes 3089 * @refcount_order: refcount bits power-of-2 exponent 3090 * @generous_increase: allow for the refcount table to be 1.5x as large as it 3091 * needs to be 3092 * 3093 * Returns: Number of bytes required for refcount blocks and table metadata. 3094 */ 3095 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size, 3096 int refcount_order, bool generous_increase, 3097 uint64_t *refblock_count) 3098 { 3099 /* 3100 * Every host cluster is reference-counted, including metadata (even 3101 * refcount metadata is recursively included). 3102 * 3103 * An accurate formula for the size of refcount metadata size is difficult 3104 * to derive. An easier method of calculation is finding the fixed point 3105 * where no further refcount blocks or table clusters are required to 3106 * reference count every cluster. 
3107 */ 3108 int64_t blocks_per_table_cluster = cluster_size / sizeof(uint64_t); 3109 int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order); 3110 int64_t table = 0; /* number of refcount table clusters */ 3111 int64_t blocks = 0; /* number of refcount block clusters */ 3112 int64_t last; 3113 int64_t n = 0; 3114 3115 do { 3116 last = n; 3117 blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block); 3118 table = DIV_ROUND_UP(blocks, blocks_per_table_cluster); 3119 n = clusters + blocks + table; 3120 3121 if (n == last && generous_increase) { 3122 clusters += DIV_ROUND_UP(table, 2); 3123 n = 0; /* force another loop */ 3124 generous_increase = false; 3125 } 3126 } while (n != last); 3127 3128 if (refblock_count) { 3129 *refblock_count = blocks; 3130 } 3131 3132 return (blocks + table) * cluster_size; 3133 } 3134 3135 /** 3136 * qcow2_calc_prealloc_size: 3137 * @total_size: virtual disk size in bytes 3138 * @cluster_size: cluster size in bytes 3139 * @refcount_order: refcount bits power-of-2 exponent 3140 * 3141 * Returns: Total number of bytes required for the fully allocated image 3142 * (including metadata). 
/**
 * qcow2_calc_prealloc_size:
 * @total_size: virtual disk size in bytes
 * @cluster_size: cluster size in bytes
 * @refcount_order: refcount bits power-of-2 exponent
 *
 * The metadata estimate is the sum of one header cluster, the L2 tables
 * (rounded up to whole clusters), the L1 table (its entry count rounded up
 * to a full cluster of entries) and the refcount structures reported by
 * qcow2_refcount_metadata_size() for all of the above plus the data.
 *
 * Returns: Total number of bytes required for the fully allocated image
 * (including metadata).
 */
static int64_t qcow2_calc_prealloc_size(int64_t total_size,
                                        size_t cluster_size,
                                        int refcount_order)
{
    int64_t data_size = ROUND_UP(total_size, cluster_size);
    int64_t meta_size;
    uint64_t l2_entries;
    uint64_t l1_entries;

    /* header: 1 cluster */
    meta_size = cluster_size;

    /* L2 tables: one entry per data cluster, in whole tables */
    l2_entries = ROUND_UP(data_size / cluster_size,
                          cluster_size / sizeof(uint64_t));
    meta_size += l2_entries * sizeof(uint64_t);

    /* L1 table: one entry per L2 table, padded to a cluster of entries */
    l1_entries = ROUND_UP(l2_entries * sizeof(uint64_t) / cluster_size,
                          cluster_size / sizeof(uint64_t));
    meta_size += l1_entries * sizeof(uint64_t);

    /* refcount table and blocks covering everything counted so far */
    meta_size += qcow2_refcount_metadata_size(
            (meta_size + data_size) / cluster_size,
            cluster_size, refcount_order, false, NULL);

    return meta_size + data_size;
}
(!strcmp(buf, "0.10")) { 3207 ret = 2; 3208 } else if (!strcmp(buf, "1.1")) { 3209 ret = 3; 3210 } else { 3211 error_setg(errp, "Invalid compatibility level: '%s'", buf); 3212 ret = -EINVAL; 3213 } 3214 g_free(buf); 3215 return ret; 3216 } 3217 3218 static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version, 3219 Error **errp) 3220 { 3221 uint64_t refcount_bits; 3222 3223 refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16); 3224 if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) { 3225 error_setg(errp, "Refcount width must be a power of two and may not " 3226 "exceed 64 bits"); 3227 return 0; 3228 } 3229 3230 if (version < 3 && refcount_bits != 16) { 3231 error_setg(errp, "Different refcount widths than 16 bits require " 3232 "compatibility level 1.1 or above (use compat=1.1 or " 3233 "greater)"); 3234 return 0; 3235 } 3236 3237 return refcount_bits; 3238 } 3239 3240 static int coroutine_fn 3241 qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) 3242 { 3243 BlockdevCreateOptionsQcow2 *qcow2_opts; 3244 QDict *options; 3245 3246 /* 3247 * Open the image file and write a minimal qcow2 header. 3248 * 3249 * We keep things simple and start with a zero-sized image. We also 3250 * do without refcount blocks or a L1 table for now. We'll fix the 3251 * inconsistency later. 3252 * 3253 * We do need a refcount table because growing the refcount table means 3254 * allocating two new refcount blocks - the seconds of which would be at 3255 * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file 3256 * size for any qcow2 image. 
3257 */ 3258 BlockBackend *blk = NULL; 3259 BlockDriverState *bs = NULL; 3260 BlockDriverState *data_bs = NULL; 3261 QCowHeader *header; 3262 size_t cluster_size; 3263 int version; 3264 int refcount_order; 3265 uint64_t* refcount_table; 3266 Error *local_err = NULL; 3267 int ret; 3268 3269 assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2); 3270 qcow2_opts = &create_options->u.qcow2; 3271 3272 bs = bdrv_open_blockdev_ref(qcow2_opts->file, errp); 3273 if (bs == NULL) { 3274 return -EIO; 3275 } 3276 3277 /* Validate options and set default values */ 3278 if (!QEMU_IS_ALIGNED(qcow2_opts->size, BDRV_SECTOR_SIZE)) { 3279 error_setg(errp, "Image size must be a multiple of 512 bytes"); 3280 ret = -EINVAL; 3281 goto out; 3282 } 3283 3284 if (qcow2_opts->has_version) { 3285 switch (qcow2_opts->version) { 3286 case BLOCKDEV_QCOW2_VERSION_V2: 3287 version = 2; 3288 break; 3289 case BLOCKDEV_QCOW2_VERSION_V3: 3290 version = 3; 3291 break; 3292 default: 3293 g_assert_not_reached(); 3294 } 3295 } else { 3296 version = 3; 3297 } 3298 3299 if (qcow2_opts->has_cluster_size) { 3300 cluster_size = qcow2_opts->cluster_size; 3301 } else { 3302 cluster_size = DEFAULT_CLUSTER_SIZE; 3303 } 3304 3305 if (!validate_cluster_size(cluster_size, errp)) { 3306 ret = -EINVAL; 3307 goto out; 3308 } 3309 3310 if (!qcow2_opts->has_preallocation) { 3311 qcow2_opts->preallocation = PREALLOC_MODE_OFF; 3312 } 3313 if (qcow2_opts->has_backing_file && 3314 qcow2_opts->preallocation != PREALLOC_MODE_OFF) 3315 { 3316 error_setg(errp, "Backing file and preallocation cannot be used at " 3317 "the same time"); 3318 ret = -EINVAL; 3319 goto out; 3320 } 3321 if (qcow2_opts->has_backing_fmt && !qcow2_opts->has_backing_file) { 3322 error_setg(errp, "Backing format cannot be used without backing file"); 3323 ret = -EINVAL; 3324 goto out; 3325 } 3326 3327 if (!qcow2_opts->has_lazy_refcounts) { 3328 qcow2_opts->lazy_refcounts = false; 3329 } 3330 if (version < 3 && qcow2_opts->lazy_refcounts) { 3331 
error_setg(errp, "Lazy refcounts only supported with compatibility " 3332 "level 1.1 and above (use version=v3 or greater)"); 3333 ret = -EINVAL; 3334 goto out; 3335 } 3336 3337 if (!qcow2_opts->has_refcount_bits) { 3338 qcow2_opts->refcount_bits = 16; 3339 } 3340 if (qcow2_opts->refcount_bits > 64 || 3341 !is_power_of_2(qcow2_opts->refcount_bits)) 3342 { 3343 error_setg(errp, "Refcount width must be a power of two and may not " 3344 "exceed 64 bits"); 3345 ret = -EINVAL; 3346 goto out; 3347 } 3348 if (version < 3 && qcow2_opts->refcount_bits != 16) { 3349 error_setg(errp, "Different refcount widths than 16 bits require " 3350 "compatibility level 1.1 or above (use version=v3 or " 3351 "greater)"); 3352 ret = -EINVAL; 3353 goto out; 3354 } 3355 refcount_order = ctz32(qcow2_opts->refcount_bits); 3356 3357 if (qcow2_opts->data_file_raw && !qcow2_opts->data_file) { 3358 error_setg(errp, "data-file-raw requires data-file"); 3359 ret = -EINVAL; 3360 goto out; 3361 } 3362 if (qcow2_opts->data_file_raw && qcow2_opts->has_backing_file) { 3363 error_setg(errp, "Backing file and data-file-raw cannot be used at " 3364 "the same time"); 3365 ret = -EINVAL; 3366 goto out; 3367 } 3368 3369 if (qcow2_opts->data_file) { 3370 if (version < 3) { 3371 error_setg(errp, "External data files are only supported with " 3372 "compatibility level 1.1 and above (use version=v3 or " 3373 "greater)"); 3374 ret = -EINVAL; 3375 goto out; 3376 } 3377 data_bs = bdrv_open_blockdev_ref(qcow2_opts->data_file, errp); 3378 if (data_bs == NULL) { 3379 ret = -EIO; 3380 goto out; 3381 } 3382 } 3383 3384 /* Create BlockBackend to write to the image */ 3385 blk = blk_new(bdrv_get_aio_context(bs), 3386 BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL); 3387 ret = blk_insert_bs(blk, bs, errp); 3388 if (ret < 0) { 3389 goto out; 3390 } 3391 blk_set_allow_write_beyond_eof(blk, true); 3392 3393 /* Write the header */ 3394 QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); 3395 header = 
g_malloc0(cluster_size); 3396 *header = (QCowHeader) { 3397 .magic = cpu_to_be32(QCOW_MAGIC), 3398 .version = cpu_to_be32(version), 3399 .cluster_bits = cpu_to_be32(ctz32(cluster_size)), 3400 .size = cpu_to_be64(0), 3401 .l1_table_offset = cpu_to_be64(0), 3402 .l1_size = cpu_to_be32(0), 3403 .refcount_table_offset = cpu_to_be64(cluster_size), 3404 .refcount_table_clusters = cpu_to_be32(1), 3405 .refcount_order = cpu_to_be32(refcount_order), 3406 .header_length = cpu_to_be32(sizeof(*header)), 3407 }; 3408 3409 /* We'll update this to correct value later */ 3410 header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); 3411 3412 if (qcow2_opts->lazy_refcounts) { 3413 header->compatible_features |= 3414 cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); 3415 } 3416 if (data_bs) { 3417 header->incompatible_features |= 3418 cpu_to_be64(QCOW2_INCOMPAT_DATA_FILE); 3419 } 3420 if (qcow2_opts->data_file_raw) { 3421 header->autoclear_features |= 3422 cpu_to_be64(QCOW2_AUTOCLEAR_DATA_FILE_RAW); 3423 } 3424 3425 ret = blk_pwrite(blk, 0, header, cluster_size, 0); 3426 g_free(header); 3427 if (ret < 0) { 3428 error_setg_errno(errp, -ret, "Could not write qcow2 header"); 3429 goto out; 3430 } 3431 3432 /* Write a refcount table with one refcount block */ 3433 refcount_table = g_malloc0(2 * cluster_size); 3434 refcount_table[0] = cpu_to_be64(2 * cluster_size); 3435 ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size, 0); 3436 g_free(refcount_table); 3437 3438 if (ret < 0) { 3439 error_setg_errno(errp, -ret, "Could not write refcount table"); 3440 goto out; 3441 } 3442 3443 blk_unref(blk); 3444 blk = NULL; 3445 3446 /* 3447 * And now open the image and make it consistent first (i.e. 
increase the 3448 * refcount of the cluster that is occupied by the header and the refcount 3449 * table) 3450 */ 3451 options = qdict_new(); 3452 qdict_put_str(options, "driver", "qcow2"); 3453 qdict_put_str(options, "file", bs->node_name); 3454 if (data_bs) { 3455 qdict_put_str(options, "data-file", data_bs->node_name); 3456 } 3457 blk = blk_new_open(NULL, NULL, options, 3458 BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH, 3459 &local_err); 3460 if (blk == NULL) { 3461 error_propagate(errp, local_err); 3462 ret = -EIO; 3463 goto out; 3464 } 3465 3466 ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size); 3467 if (ret < 0) { 3468 error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " 3469 "header and refcount table"); 3470 goto out; 3471 3472 } else if (ret != 0) { 3473 error_report("Huh, first cluster in empty image is already in use?"); 3474 abort(); 3475 } 3476 3477 /* Set the external data file if necessary */ 3478 if (data_bs) { 3479 BDRVQcow2State *s = blk_bs(blk)->opaque; 3480 s->image_data_file = g_strdup(data_bs->filename); 3481 } 3482 3483 /* Create a full header (including things like feature table) */ 3484 ret = qcow2_update_header(blk_bs(blk)); 3485 if (ret < 0) { 3486 error_setg_errno(errp, -ret, "Could not update qcow2 header"); 3487 goto out; 3488 } 3489 3490 /* Okay, now that we have a valid image, let's give it the right size */ 3491 ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation, 3492 errp); 3493 if (ret < 0) { 3494 error_prepend(errp, "Could not resize image: "); 3495 goto out; 3496 } 3497 3498 /* Want a backing file? 
There you go.*/ 3499 if (qcow2_opts->has_backing_file) { 3500 const char *backing_format = NULL; 3501 3502 if (qcow2_opts->has_backing_fmt) { 3503 backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt); 3504 } 3505 3506 ret = bdrv_change_backing_file(blk_bs(blk), qcow2_opts->backing_file, 3507 backing_format); 3508 if (ret < 0) { 3509 error_setg_errno(errp, -ret, "Could not assign backing file '%s' " 3510 "with format '%s'", qcow2_opts->backing_file, 3511 backing_format); 3512 goto out; 3513 } 3514 } 3515 3516 /* Want encryption? There you go. */ 3517 if (qcow2_opts->has_encrypt) { 3518 ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp); 3519 if (ret < 0) { 3520 goto out; 3521 } 3522 } 3523 3524 blk_unref(blk); 3525 blk = NULL; 3526 3527 /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning. 3528 * Using BDRV_O_NO_IO, since encryption is now setup we don't want to 3529 * have to setup decryption context. We're not doing any I/O on the top 3530 * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does 3531 * not have effect. 
3532 */ 3533 options = qdict_new(); 3534 qdict_put_str(options, "driver", "qcow2"); 3535 qdict_put_str(options, "file", bs->node_name); 3536 if (data_bs) { 3537 qdict_put_str(options, "data-file", data_bs->node_name); 3538 } 3539 blk = blk_new_open(NULL, NULL, options, 3540 BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO, 3541 &local_err); 3542 if (blk == NULL) { 3543 error_propagate(errp, local_err); 3544 ret = -EIO; 3545 goto out; 3546 } 3547 3548 ret = 0; 3549 out: 3550 blk_unref(blk); 3551 bdrv_unref(bs); 3552 bdrv_unref(data_bs); 3553 return ret; 3554 } 3555 3556 static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts, 3557 Error **errp) 3558 { 3559 BlockdevCreateOptions *create_options = NULL; 3560 QDict *qdict; 3561 Visitor *v; 3562 BlockDriverState *bs = NULL; 3563 BlockDriverState *data_bs = NULL; 3564 Error *local_err = NULL; 3565 const char *val; 3566 int ret; 3567 3568 /* Only the keyval visitor supports the dotted syntax needed for 3569 * encryption, so go through a QDict before getting a QAPI type. Ignore 3570 * options meant for the protocol layer so that the visitor doesn't 3571 * complain. */ 3572 qdict = qemu_opts_to_qdict_filtered(opts, NULL, bdrv_qcow2.create_opts, 3573 true); 3574 3575 /* Handle encryption options */ 3576 val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT); 3577 if (val && !strcmp(val, "on")) { 3578 qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow"); 3579 } else if (val && !strcmp(val, "off")) { 3580 qdict_del(qdict, BLOCK_OPT_ENCRYPT); 3581 } 3582 3583 val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT); 3584 if (val && !strcmp(val, "aes")) { 3585 qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow"); 3586 } 3587 3588 /* Convert compat=0.10/1.1 into compat=v2/v3, to be renamed into 3589 * version=v2/v3 below. 
*/ 3590 val = qdict_get_try_str(qdict, BLOCK_OPT_COMPAT_LEVEL); 3591 if (val && !strcmp(val, "0.10")) { 3592 qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v2"); 3593 } else if (val && !strcmp(val, "1.1")) { 3594 qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v3"); 3595 } 3596 3597 /* Change legacy command line options into QMP ones */ 3598 static const QDictRenames opt_renames[] = { 3599 { BLOCK_OPT_BACKING_FILE, "backing-file" }, 3600 { BLOCK_OPT_BACKING_FMT, "backing-fmt" }, 3601 { BLOCK_OPT_CLUSTER_SIZE, "cluster-size" }, 3602 { BLOCK_OPT_LAZY_REFCOUNTS, "lazy-refcounts" }, 3603 { BLOCK_OPT_REFCOUNT_BITS, "refcount-bits" }, 3604 { BLOCK_OPT_ENCRYPT, BLOCK_OPT_ENCRYPT_FORMAT }, 3605 { BLOCK_OPT_COMPAT_LEVEL, "version" }, 3606 { BLOCK_OPT_DATA_FILE_RAW, "data-file-raw" }, 3607 { NULL, NULL }, 3608 }; 3609 3610 if (!qdict_rename_keys(qdict, opt_renames, errp)) { 3611 ret = -EINVAL; 3612 goto finish; 3613 } 3614 3615 /* Create and open the file (protocol layer) */ 3616 ret = bdrv_create_file(filename, opts, errp); 3617 if (ret < 0) { 3618 goto finish; 3619 } 3620 3621 bs = bdrv_open(filename, NULL, NULL, 3622 BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp); 3623 if (bs == NULL) { 3624 ret = -EIO; 3625 goto finish; 3626 } 3627 3628 /* Create and open an external data file (protocol layer) */ 3629 val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE); 3630 if (val) { 3631 ret = bdrv_create_file(val, opts, errp); 3632 if (ret < 0) { 3633 goto finish; 3634 } 3635 3636 data_bs = bdrv_open(val, NULL, NULL, 3637 BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, 3638 errp); 3639 if (data_bs == NULL) { 3640 ret = -EIO; 3641 goto finish; 3642 } 3643 3644 qdict_del(qdict, BLOCK_OPT_DATA_FILE); 3645 qdict_put_str(qdict, "data-file", data_bs->node_name); 3646 } 3647 3648 /* Set 'driver' and 'node' options */ 3649 qdict_put_str(qdict, "driver", "qcow2"); 3650 qdict_put_str(qdict, "file", bs->node_name); 3651 3652 /* Now get the QAPI type BlockdevCreateOptions */ 3653 v = 
qobject_input_visitor_new_flat_confused(qdict, errp); 3654 if (!v) { 3655 ret = -EINVAL; 3656 goto finish; 3657 } 3658 3659 visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err); 3660 visit_free(v); 3661 3662 if (local_err) { 3663 error_propagate(errp, local_err); 3664 ret = -EINVAL; 3665 goto finish; 3666 } 3667 3668 /* Silently round up size */ 3669 create_options->u.qcow2.size = ROUND_UP(create_options->u.qcow2.size, 3670 BDRV_SECTOR_SIZE); 3671 3672 /* Create the qcow2 image (format layer) */ 3673 ret = qcow2_co_create(create_options, errp); 3674 if (ret < 0) { 3675 goto finish; 3676 } 3677 3678 ret = 0; 3679 finish: 3680 qobject_unref(qdict); 3681 bdrv_unref(bs); 3682 bdrv_unref(data_bs); 3683 qapi_free_BlockdevCreateOptions(create_options); 3684 return ret; 3685 } 3686 3687 3688 static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes) 3689 { 3690 int64_t nr; 3691 int res; 3692 3693 /* Clamp to image length, before checking status of underlying sectors */ 3694 if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) { 3695 bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset; 3696 } 3697 3698 if (!bytes) { 3699 return true; 3700 } 3701 res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL); 3702 return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes; 3703 } 3704 3705 static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, 3706 int64_t offset, int bytes, BdrvRequestFlags flags) 3707 { 3708 int ret; 3709 BDRVQcow2State *s = bs->opaque; 3710 3711 uint32_t head = offset % s->cluster_size; 3712 uint32_t tail = (offset + bytes) % s->cluster_size; 3713 3714 trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes); 3715 if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) { 3716 tail = 0; 3717 } 3718 3719 if (head || tail) { 3720 uint64_t off; 3721 unsigned int nr; 3722 3723 assert(head + bytes <= s->cluster_size); 3724 3725 /* check whether remainder of cluster already reads 
as zero */ 3726 if (!(is_zero(bs, offset - head, head) && 3727 is_zero(bs, offset + bytes, 3728 tail ? s->cluster_size - tail : 0))) { 3729 return -ENOTSUP; 3730 } 3731 3732 qemu_co_mutex_lock(&s->lock); 3733 /* We can have new write after previous check */ 3734 offset = QEMU_ALIGN_DOWN(offset, s->cluster_size); 3735 bytes = s->cluster_size; 3736 nr = s->cluster_size; 3737 ret = qcow2_get_cluster_offset(bs, offset, &nr, &off); 3738 if (ret != QCOW2_CLUSTER_UNALLOCATED && 3739 ret != QCOW2_CLUSTER_ZERO_PLAIN && 3740 ret != QCOW2_CLUSTER_ZERO_ALLOC) { 3741 qemu_co_mutex_unlock(&s->lock); 3742 return -ENOTSUP; 3743 } 3744 } else { 3745 qemu_co_mutex_lock(&s->lock); 3746 } 3747 3748 trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes); 3749 3750 /* Whatever is left can use real zero clusters */ 3751 ret = qcow2_cluster_zeroize(bs, offset, bytes, flags); 3752 qemu_co_mutex_unlock(&s->lock); 3753 3754 return ret; 3755 } 3756 3757 static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs, 3758 int64_t offset, int bytes) 3759 { 3760 int ret; 3761 BDRVQcow2State *s = bs->opaque; 3762 3763 if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) { 3764 assert(bytes < s->cluster_size); 3765 /* Ignore partial clusters, except for the special case of the 3766 * complete partial cluster at the end of an unaligned file */ 3767 if (!QEMU_IS_ALIGNED(offset, s->cluster_size) || 3768 offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) { 3769 return -ENOTSUP; 3770 } 3771 } 3772 3773 qemu_co_mutex_lock(&s->lock); 3774 ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST, 3775 false); 3776 qemu_co_mutex_unlock(&s->lock); 3777 return ret; 3778 } 3779 3780 static int coroutine_fn 3781 qcow2_co_copy_range_from(BlockDriverState *bs, 3782 BdrvChild *src, uint64_t src_offset, 3783 BdrvChild *dst, uint64_t dst_offset, 3784 uint64_t bytes, BdrvRequestFlags read_flags, 3785 BdrvRequestFlags write_flags) 3786 { 3787 BDRVQcow2State *s = bs->opaque; 3788 int ret; 
3789 unsigned int cur_bytes; /* number of bytes in current iteration */ 3790 BdrvChild *child = NULL; 3791 BdrvRequestFlags cur_write_flags; 3792 3793 assert(!bs->encrypted); 3794 qemu_co_mutex_lock(&s->lock); 3795 3796 while (bytes != 0) { 3797 uint64_t copy_offset = 0; 3798 /* prepare next request */ 3799 cur_bytes = MIN(bytes, INT_MAX); 3800 cur_write_flags = write_flags; 3801 3802 ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, ©_offset); 3803 if (ret < 0) { 3804 goto out; 3805 } 3806 3807 switch (ret) { 3808 case QCOW2_CLUSTER_UNALLOCATED: 3809 if (bs->backing && bs->backing->bs) { 3810 int64_t backing_length = bdrv_getlength(bs->backing->bs); 3811 if (src_offset >= backing_length) { 3812 cur_write_flags |= BDRV_REQ_ZERO_WRITE; 3813 } else { 3814 child = bs->backing; 3815 cur_bytes = MIN(cur_bytes, backing_length - src_offset); 3816 copy_offset = src_offset; 3817 } 3818 } else { 3819 cur_write_flags |= BDRV_REQ_ZERO_WRITE; 3820 } 3821 break; 3822 3823 case QCOW2_CLUSTER_ZERO_PLAIN: 3824 case QCOW2_CLUSTER_ZERO_ALLOC: 3825 cur_write_flags |= BDRV_REQ_ZERO_WRITE; 3826 break; 3827 3828 case QCOW2_CLUSTER_COMPRESSED: 3829 ret = -ENOTSUP; 3830 goto out; 3831 3832 case QCOW2_CLUSTER_NORMAL: 3833 child = s->data_file; 3834 copy_offset += offset_into_cluster(s, src_offset); 3835 if ((copy_offset & 511) != 0) { 3836 ret = -EIO; 3837 goto out; 3838 } 3839 break; 3840 3841 default: 3842 abort(); 3843 } 3844 qemu_co_mutex_unlock(&s->lock); 3845 ret = bdrv_co_copy_range_from(child, 3846 copy_offset, 3847 dst, dst_offset, 3848 cur_bytes, read_flags, cur_write_flags); 3849 qemu_co_mutex_lock(&s->lock); 3850 if (ret < 0) { 3851 goto out; 3852 } 3853 3854 bytes -= cur_bytes; 3855 src_offset += cur_bytes; 3856 dst_offset += cur_bytes; 3857 } 3858 ret = 0; 3859 3860 out: 3861 qemu_co_mutex_unlock(&s->lock); 3862 return ret; 3863 } 3864 3865 static int coroutine_fn 3866 qcow2_co_copy_range_to(BlockDriverState *bs, 3867 BdrvChild *src, uint64_t src_offset, 3868 
/*
 * Resize the image @bs so that its virtual disk size becomes @offset bytes.
 *
 * @offset:   new size in bytes; must be a multiple of 512
 * @exact:    only forwarded to the external data file in the
 *            PREALLOC_MODE_OFF case; not passed on to bs->file
 * @prealloc: preallocation mode for the newly added area; only
 *            PREALLOC_MODE_OFF is accepted when shrinking
 * @errp:     error object filled in on failure
 *
 * Images with snapshots, and images for which
 * qcow2_truncate_bitmaps_check() reports a problem, are refused with
 * -ENOTSUP. Apart from the initial argument checks the whole operation runs
 * with s->lock held.
 *
 * Returns 0 on success and a negative errno value on failure.
 */
static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
                                          bool exact, PreallocMode prealloc,
                                          Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_length;
    int64_t new_l1_size;
    int ret;
    QDict *options;

    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
        prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
    {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    if (offset & 511) {
        error_setg(errp, "The new size must be a multiple of 512");
        return -EINVAL;
    }

    qemu_co_mutex_lock(&s->lock);

    /* cannot proceed if image has snapshots */
    if (s->nb_snapshots) {
        error_setg(errp, "Can't resize an image which has snapshots");
        ret = -ENOTSUP;
        goto fail;
    }

    /* cannot proceed if image has bitmaps */
    if (qcow2_truncate_bitmaps_check(bs, errp)) {
        ret = -ENOTSUP;
        goto fail;
    }

    /* Current virtual size in bytes, and the L1 size the new one needs */
    old_length = bs->total_sectors * BDRV_SECTOR_SIZE;
    new_l1_size = size_to_l1(s, offset);

    if (offset < old_length) {
        /* Shrinking */
        int64_t last_cluster, old_file_size;
        if (prealloc != PREALLOC_MODE_OFF) {
            error_setg(errp,
                       "Preallocation can't be used for shrinking an image");
            ret = -EINVAL;
            goto fail;
        }

        /*
         * Discard everything from the first cluster boundary at or above
         * the new size up to the old end of the image.
         */
        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
                                    old_length - ROUND_UP(offset,
                                                          s->cluster_size),
                                    QCOW2_DISCARD_ALWAYS, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
            goto fail;
        }

        ret = qcow2_shrink_l1_table(bs, new_l1_size);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to reduce the number of L2 tables");
            goto fail;
        }

        ret = qcow2_shrink_reftable(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to discard unused refblocks");
            goto fail;
        }

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            ret = old_file_size;
            goto fail;
        }
        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
        if (last_cluster < 0) {
            error_setg_errno(errp, -last_cluster,
                             "Failed to find the last cluster");
            ret = last_cluster;
            goto fail;
        }
        /* Cut off whatever lies behind the last cluster still in use */
        if ((last_cluster + 1) * s->cluster_size < old_file_size) {
            Error *local_err = NULL;

            /*
             * Do not pass @exact here: It will not help the user if
             * we get an error here just because they wanted to shrink
             * their qcow2 image (on a block device) with qemu-img.
             * (And on the qcow2 layer, the @exact requirement is
             * always fulfilled, so there is no need to pass it on.)
             */
            /* Best effort: a failure here is only reported as a warning */
            bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
                             false, PREALLOC_MODE_OFF, &local_err);
            if (local_err) {
                warn_reportf_err(local_err,
                                 "Failed to truncate the tail of the image: ");
            }
        }
    } else {
        /* Growing (or keeping the size): make room in the L1 table */
        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
            goto fail;
        }
    }

    switch (prealloc) {
    case PREALLOC_MODE_OFF:
        if (has_data_file(bs)) {
            /*
             * If the caller wants an exact resize, the external data
             * file should be resized to the exact target size, too,
             * so we pass @exact here.
             */
            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp);
            if (ret < 0) {
                goto fail;
            }
        }
        break;

    case PREALLOC_MODE_METADATA:
        ret = preallocate_co(bs, old_length, offset, prealloc, errp);
        if (ret < 0) {
            goto fail;
        }
        break;

    case PREALLOC_MODE_FALLOC:
    case PREALLOC_MODE_FULL:
    {
        int64_t allocation_start, host_offset, guest_offset;
        int64_t clusters_allocated;
        int64_t old_file_size, new_file_size;
        uint64_t nb_new_data_clusters, nb_new_l2_tables;

        /* With a data file, preallocation means just allocating the metadata
         * and forwarding the truncate request to the data file */
        if (has_data_file(bs)) {
            ret = preallocate_co(bs, old_length, offset, prealloc, errp);
            if (ret < 0) {
                goto fail;
            }
            break;
        }

        old_file_size = bdrv_getlength(bs->file->bs);
        if (old_file_size < 0) {
            error_setg_errno(errp, -old_file_size,
                             "Failed to inquire current file length");
            ret = old_file_size;
            goto fail;
        }
        /* New clusters are placed behind the cluster-aligned end of file */
        old_file_size = ROUND_UP(old_file_size, s->cluster_size);

        nb_new_data_clusters = DIV_ROUND_UP(offset - old_length,
                                            s->cluster_size);

        /* This is an overestimation; we will not actually allocate space for
         * these in the file but just make sure the new refcount structures are
         * able to cover them so we will not have to allocate new refblocks
         * while entering the data blocks in the potentially new L2 tables.
         * (We do not actually care where the L2 tables are placed. Maybe they
         *  are already allocated or they can be placed somewhere before
         *  @old_file_size. It does not matter because they will be fully
         *  allocated automatically, so they do not need to be covered by the
         *  preallocation. All that matters is that we will not have to allocate
         *  new refcount structures for them.) */
        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
                                        s->cluster_size / sizeof(uint64_t));
        /* The cluster range may not be aligned to L2 boundaries, so add one L2
         * table for a potential head/tail */
        nb_new_l2_tables++;

        allocation_start = qcow2_refcount_area(bs, old_file_size,
                                               nb_new_data_clusters +
                                               nb_new_l2_tables,
                                               true, 0, 0);
        if (allocation_start < 0) {
            error_setg_errno(errp, -allocation_start,
                             "Failed to resize refcount structures");
            ret = allocation_start;
            goto fail;
        }

        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
                                                     nb_new_data_clusters);
        if (clusters_allocated < 0) {
            error_setg_errno(errp, -clusters_allocated,
                             "Failed to allocate data clusters");
            ret = clusters_allocated;
            goto fail;
        }

        assert(clusters_allocated == nb_new_data_clusters);

        /* Allocate the data area */
        new_file_size = allocation_start +
                        nb_new_data_clusters * s->cluster_size;
        /* Image file grows, so @exact does not matter */
        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp);
        if (ret < 0) {
            error_prepend(errp, "Failed to resize underlying file: ");
            /* Give back the clusters that were just allocated */
            qcow2_free_clusters(bs, allocation_start,
                                nb_new_data_clusters * s->cluster_size,
                                QCOW2_DISCARD_OTHER);
            goto fail;
        }

        /* Create the necessary L2 entries */
        host_offset = allocation_start;
        guest_offset = old_length;
        while (nb_new_data_clusters) {
            /* Link at most up to the end of the current L2 slice per round */
            int64_t nb_clusters = MIN(
                nb_new_data_clusters,
                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
            QCowL2Meta allocation = {
                .offset       = guest_offset,
                .alloc_offset = host_offset,
                .nb_clusters  = nb_clusters,
            };
            qemu_co_queue_init(&allocation.dependent_requests);

            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to update L2 tables");
                /*
                 * host_offset and nb_new_data_clusters have already been
                 * advanced past the clusters linked in earlier rounds, so
                 * only the clusters not linked yet are freed here.
                 */
                qcow2_free_clusters(bs, host_offset,
                                    nb_new_data_clusters * s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
                goto fail;
            }

            guest_offset += nb_clusters * s->cluster_size;
            host_offset += nb_clusters * s->cluster_size;
            nb_new_data_clusters -= nb_clusters;
        }
        break;
    }

    default:
        g_assert_not_reached();
    }

    if (prealloc != PREALLOC_MODE_OFF) {
        /* Flush metadata before actually changing the image size */
        ret = qcow2_write_caches(bs);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "Failed to flush the preallocated area to disk");
            goto fail;
        }
    }

    /*
     * NOTE(review): the in-memory size is updated before the header write
     * below; if that write fails, bs->total_sectors keeps the new value.
     * Confirm this is intended.
     */
    bs->total_sectors = offset / BDRV_SECTOR_SIZE;

    /* write updated header.size */
    /* From here on @offset holds the big-endian on-disk value, not a size */
    offset = cpu_to_be64(offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
                           &offset, sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to update the image size");
        goto fail;
    }

    s->l1_vm_state_index = new_l1_size;

    /* Update cache sizes */
    options = qdict_clone_shallow(bs->options);
    ret = qcow2_update_options(bs, options, s->flags, errp);
    qobject_unref(options);
    if (ret < 0) {
        goto fail;
    }
    ret = 0;
fail:
    /* Reached on success as well; @ret carries the result either way */
    qemu_co_mutex_unlock(&s->lock);
    return ret;
}
int64_t len = bdrv_getlength(bs->file->bs); 4245 if (len < 0) { 4246 return len; 4247 } 4248 return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, NULL); 4249 } 4250 4251 if (offset_into_cluster(s, offset)) { 4252 return -EINVAL; 4253 } 4254 4255 buf = qemu_blockalign(bs, s->cluster_size); 4256 if (bytes != s->cluster_size) { 4257 if (bytes > s->cluster_size || 4258 offset + bytes != bs->total_sectors << BDRV_SECTOR_BITS) 4259 { 4260 qemu_vfree(buf); 4261 return -EINVAL; 4262 } 4263 /* Zero-pad last write if image size is not cluster aligned */ 4264 memset(buf + bytes, 0, s->cluster_size - bytes); 4265 } 4266 qemu_iovec_to_buf(qiov, qiov_offset, buf, bytes); 4267 4268 out_buf = g_malloc(s->cluster_size); 4269 4270 out_len = qcow2_co_compress(bs, out_buf, s->cluster_size - 1, 4271 buf, s->cluster_size); 4272 if (out_len == -ENOMEM) { 4273 /* could not compress: write normal cluster */ 4274 ret = qcow2_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 0); 4275 if (ret < 0) { 4276 goto fail; 4277 } 4278 goto success; 4279 } else if (out_len < 0) { 4280 ret = -EINVAL; 4281 goto fail; 4282 } 4283 4284 qemu_co_mutex_lock(&s->lock); 4285 ret = qcow2_alloc_compressed_cluster_offset(bs, offset, out_len, 4286 &cluster_offset); 4287 if (ret < 0) { 4288 qemu_co_mutex_unlock(&s->lock); 4289 goto fail; 4290 } 4291 4292 ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true); 4293 qemu_co_mutex_unlock(&s->lock); 4294 if (ret < 0) { 4295 goto fail; 4296 } 4297 4298 BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED); 4299 ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0); 4300 if (ret < 0) { 4301 goto fail; 4302 } 4303 success: 4304 ret = 0; 4305 fail: 4306 qemu_vfree(buf); 4307 g_free(out_buf); 4308 return ret; 4309 } 4310 4311 static int coroutine_fn 4312 qcow2_co_preadv_compressed(BlockDriverState *bs, 4313 uint64_t file_cluster_offset, 4314 uint64_t offset, 4315 uint64_t bytes, 4316 QEMUIOVector *qiov, 4317 size_t 
qiov_offset) 4318 { 4319 BDRVQcow2State *s = bs->opaque; 4320 int ret = 0, csize, nb_csectors; 4321 uint64_t coffset; 4322 uint8_t *buf, *out_buf; 4323 int offset_in_cluster = offset_into_cluster(s, offset); 4324 4325 coffset = file_cluster_offset & s->cluster_offset_mask; 4326 nb_csectors = ((file_cluster_offset >> s->csize_shift) & s->csize_mask) + 1; 4327 csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - 4328 (coffset & ~QCOW2_COMPRESSED_SECTOR_MASK); 4329 4330 buf = g_try_malloc(csize); 4331 if (!buf) { 4332 return -ENOMEM; 4333 } 4334 4335 out_buf = qemu_blockalign(bs, s->cluster_size); 4336 4337 BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); 4338 ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0); 4339 if (ret < 0) { 4340 goto fail; 4341 } 4342 4343 if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) { 4344 ret = -EIO; 4345 goto fail; 4346 } 4347 4348 qemu_iovec_from_buf(qiov, qiov_offset, out_buf + offset_in_cluster, bytes); 4349 4350 fail: 4351 qemu_vfree(out_buf); 4352 g_free(buf); 4353 4354 return ret; 4355 } 4356 4357 static int make_completely_empty(BlockDriverState *bs) 4358 { 4359 BDRVQcow2State *s = bs->opaque; 4360 Error *local_err = NULL; 4361 int ret, l1_clusters; 4362 int64_t offset; 4363 uint64_t *new_reftable = NULL; 4364 uint64_t rt_entry, l1_size2; 4365 struct { 4366 uint64_t l1_offset; 4367 uint64_t reftable_offset; 4368 uint32_t reftable_clusters; 4369 } QEMU_PACKED l1_ofs_rt_ofs_cls; 4370 4371 ret = qcow2_cache_empty(bs, s->l2_table_cache); 4372 if (ret < 0) { 4373 goto fail; 4374 } 4375 4376 ret = qcow2_cache_empty(bs, s->refcount_block_cache); 4377 if (ret < 0) { 4378 goto fail; 4379 } 4380 4381 /* Refcounts will be broken utterly */ 4382 ret = qcow2_mark_dirty(bs); 4383 if (ret < 0) { 4384 goto fail; 4385 } 4386 4387 BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); 4388 4389 l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); 4390 l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t); 4391 4392 /* 
After this call, neither the in-memory nor the on-disk refcount 4393 * information accurately describe the actual references */ 4394 4395 ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset, 4396 l1_clusters * s->cluster_size, 0); 4397 if (ret < 0) { 4398 goto fail_broken_refcounts; 4399 } 4400 memset(s->l1_table, 0, l1_size2); 4401 4402 BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE); 4403 4404 /* Overwrite enough clusters at the beginning of the sectors to place 4405 * the refcount table, a refcount block and the L1 table in; this may 4406 * overwrite parts of the existing refcount and L1 table, which is not 4407 * an issue because the dirty flag is set, complete data loss is in fact 4408 * desired and partial data loss is consequently fine as well */ 4409 ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size, 4410 (2 + l1_clusters) * s->cluster_size, 0); 4411 /* This call (even if it failed overall) may have overwritten on-disk 4412 * refcount structures; in that case, the in-memory refcount information 4413 * will probably differ from the on-disk information which makes the BDS 4414 * unusable */ 4415 if (ret < 0) { 4416 goto fail_broken_refcounts; 4417 } 4418 4419 BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); 4420 BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); 4421 4422 /* "Create" an empty reftable (one cluster) directly after the image 4423 * header and an empty L1 table three clusters after the image header; 4424 * the cluster between those two will be used as the first refblock */ 4425 l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size); 4426 l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size); 4427 l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1); 4428 ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset), 4429 &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); 4430 if (ret < 0) { 4431 goto fail_broken_refcounts; 4432 } 4433 4434 s->l1_table_offset = 3 * s->cluster_size; 4435 4436 new_reftable = 
g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t)); 4437 if (!new_reftable) { 4438 ret = -ENOMEM; 4439 goto fail_broken_refcounts; 4440 } 4441 4442 s->refcount_table_offset = s->cluster_size; 4443 s->refcount_table_size = s->cluster_size / sizeof(uint64_t); 4444 s->max_refcount_table_index = 0; 4445 4446 g_free(s->refcount_table); 4447 s->refcount_table = new_reftable; 4448 new_reftable = NULL; 4449 4450 /* Now the in-memory refcount information again corresponds to the on-disk 4451 * information (reftable is empty and no refblocks (the refblock cache is 4452 * empty)); however, this means some clusters (e.g. the image header) are 4453 * referenced, but not refcounted, but the normal qcow2 code assumes that 4454 * the in-memory information is always correct */ 4455 4456 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); 4457 4458 /* Enter the first refblock into the reftable */ 4459 rt_entry = cpu_to_be64(2 * s->cluster_size); 4460 ret = bdrv_pwrite_sync(bs->file, s->cluster_size, 4461 &rt_entry, sizeof(rt_entry)); 4462 if (ret < 0) { 4463 goto fail_broken_refcounts; 4464 } 4465 s->refcount_table[0] = 2 * s->cluster_size; 4466 4467 s->free_cluster_index = 0; 4468 assert(3 + l1_clusters <= s->refcount_block_size); 4469 offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2); 4470 if (offset < 0) { 4471 ret = offset; 4472 goto fail_broken_refcounts; 4473 } else if (offset > 0) { 4474 error_report("First cluster in emptied image is in use"); 4475 abort(); 4476 } 4477 4478 /* Now finally the in-memory information corresponds to the on-disk 4479 * structures and is correct */ 4480 ret = qcow2_mark_clean(bs); 4481 if (ret < 0) { 4482 goto fail; 4483 } 4484 4485 ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false, 4486 PREALLOC_MODE_OFF, &local_err); 4487 if (ret < 0) { 4488 error_report_err(local_err); 4489 goto fail; 4490 } 4491 4492 return 0; 4493 4494 fail_broken_refcounts: 4495 /* The BDS is unusable at this point. 
If we wanted to make it usable, we 4496 * would have to call qcow2_refcount_close(), qcow2_refcount_init(), 4497 * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init() 4498 * again. However, because the functions which could have caused this error 4499 * path to be taken are used by those functions as well, it's very likely 4500 * that that sequence will fail as well. Therefore, just eject the BDS. */ 4501 bs->drv = NULL; 4502 4503 fail: 4504 g_free(new_reftable); 4505 return ret; 4506 } 4507 4508 static int qcow2_make_empty(BlockDriverState *bs) 4509 { 4510 BDRVQcow2State *s = bs->opaque; 4511 uint64_t offset, end_offset; 4512 int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size); 4513 int l1_clusters, ret = 0; 4514 4515 l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); 4516 4517 if (s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps && 4518 3 + l1_clusters <= s->refcount_block_size && 4519 s->crypt_method_header != QCOW_CRYPT_LUKS && 4520 !has_data_file(bs)) { 4521 /* The following function only works for qcow2 v3 images (it 4522 * requires the dirty flag) and only as long as there are no 4523 * features that reserve extra clusters (such as snapshots, 4524 * LUKS header, or persistent bitmaps), because it completely 4525 * empties the image. Furthermore, the L1 table and three 4526 * additional clusters (image header, refcount table, one 4527 * refcount block) have to fit inside one refcount block. It 4528 * only resets the image file, i.e. does not work with an 4529 * external data file. */ 4530 return make_completely_empty(bs); 4531 } 4532 4533 /* This fallback code simply discards every active cluster; this is slow, 4534 * but works in all cases */ 4535 end_offset = bs->total_sectors * BDRV_SECTOR_SIZE; 4536 for (offset = 0; offset < end_offset; offset += step) { 4537 /* As this function is generally used after committing an external 4538 * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. 
Also, the 4539 * default action for this kind of discard is to pass the discard, 4540 * which will ideally result in an actually smaller image file, as 4541 * is probably desired. */ 4542 ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset), 4543 QCOW2_DISCARD_SNAPSHOT, true); 4544 if (ret < 0) { 4545 break; 4546 } 4547 } 4548 4549 return ret; 4550 } 4551 4552 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) 4553 { 4554 BDRVQcow2State *s = bs->opaque; 4555 int ret; 4556 4557 qemu_co_mutex_lock(&s->lock); 4558 ret = qcow2_write_caches(bs); 4559 qemu_co_mutex_unlock(&s->lock); 4560 4561 return ret; 4562 } 4563 4564 static ssize_t qcow2_measure_crypto_hdr_init_func(QCryptoBlock *block, 4565 size_t headerlen, void *opaque, Error **errp) 4566 { 4567 size_t *headerlenp = opaque; 4568 4569 /* Stash away the payload size */ 4570 *headerlenp = headerlen; 4571 return 0; 4572 } 4573 4574 static ssize_t qcow2_measure_crypto_hdr_write_func(QCryptoBlock *block, 4575 size_t offset, const uint8_t *buf, size_t buflen, 4576 void *opaque, Error **errp) 4577 { 4578 /* Discard the bytes, we're not actually writing to an image */ 4579 return buflen; 4580 } 4581 4582 /* Determine the number of bytes for the LUKS payload */ 4583 static bool qcow2_measure_luks_headerlen(QemuOpts *opts, size_t *len, 4584 Error **errp) 4585 { 4586 QDict *opts_qdict; 4587 QDict *cryptoopts_qdict; 4588 QCryptoBlockCreateOptions *cryptoopts; 4589 QCryptoBlock *crypto; 4590 4591 /* Extract "encrypt." 
options into a qdict */ 4592 opts_qdict = qemu_opts_to_qdict(opts, NULL); 4593 qdict_extract_subqdict(opts_qdict, &cryptoopts_qdict, "encrypt."); 4594 qobject_unref(opts_qdict); 4595 4596 /* Build QCryptoBlockCreateOptions object from qdict */ 4597 qdict_put_str(cryptoopts_qdict, "format", "luks"); 4598 cryptoopts = block_crypto_create_opts_init(cryptoopts_qdict, errp); 4599 qobject_unref(cryptoopts_qdict); 4600 if (!cryptoopts) { 4601 return false; 4602 } 4603 4604 /* Fake LUKS creation in order to determine the payload size */ 4605 crypto = qcrypto_block_create(cryptoopts, "encrypt.", 4606 qcow2_measure_crypto_hdr_init_func, 4607 qcow2_measure_crypto_hdr_write_func, 4608 len, errp); 4609 qapi_free_QCryptoBlockCreateOptions(cryptoopts); 4610 if (!crypto) { 4611 return false; 4612 } 4613 4614 qcrypto_block_free(crypto); 4615 return true; 4616 } 4617 4618 static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs, 4619 Error **errp) 4620 { 4621 Error *local_err = NULL; 4622 BlockMeasureInfo *info; 4623 uint64_t required = 0; /* bytes that contribute to required size */ 4624 uint64_t virtual_size; /* disk size as seen by guest */ 4625 uint64_t refcount_bits; 4626 uint64_t l2_tables; 4627 uint64_t luks_payload_size = 0; 4628 size_t cluster_size; 4629 int version; 4630 char *optstr; 4631 PreallocMode prealloc; 4632 bool has_backing_file; 4633 bool has_luks; 4634 4635 /* Parse image creation options */ 4636 cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err); 4637 if (local_err) { 4638 goto err; 4639 } 4640 4641 version = qcow2_opt_get_version_del(opts, &local_err); 4642 if (local_err) { 4643 goto err; 4644 } 4645 4646 refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err); 4647 if (local_err) { 4648 goto err; 4649 } 4650 4651 optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); 4652 prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr, 4653 PREALLOC_MODE_OFF, &local_err); 4654 g_free(optstr); 4655 if (local_err) { 
4656 goto err; 4657 } 4658 4659 optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); 4660 has_backing_file = !!optstr; 4661 g_free(optstr); 4662 4663 optstr = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT); 4664 has_luks = optstr && strcmp(optstr, "luks") == 0; 4665 g_free(optstr); 4666 4667 if (has_luks) { 4668 size_t headerlen; 4669 4670 if (!qcow2_measure_luks_headerlen(opts, &headerlen, &local_err)) { 4671 goto err; 4672 } 4673 4674 luks_payload_size = ROUND_UP(headerlen, cluster_size); 4675 } 4676 4677 virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); 4678 virtual_size = ROUND_UP(virtual_size, cluster_size); 4679 4680 /* Check that virtual disk size is valid */ 4681 l2_tables = DIV_ROUND_UP(virtual_size / cluster_size, 4682 cluster_size / sizeof(uint64_t)); 4683 if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) { 4684 error_setg(&local_err, "The image size is too large " 4685 "(try using a larger cluster size)"); 4686 goto err; 4687 } 4688 4689 /* Account for input image */ 4690 if (in_bs) { 4691 int64_t ssize = bdrv_getlength(in_bs); 4692 if (ssize < 0) { 4693 error_setg_errno(&local_err, -ssize, 4694 "Unable to get image virtual_size"); 4695 goto err; 4696 } 4697 4698 virtual_size = ROUND_UP(ssize, cluster_size); 4699 4700 if (has_backing_file) { 4701 /* We don't how much of the backing chain is shared by the input 4702 * image and the new image file. In the worst case the new image's 4703 * backing file has nothing in common with the input image. Be 4704 * conservative and assume all clusters need to be written. 
4705 */ 4706 required = virtual_size; 4707 } else { 4708 int64_t offset; 4709 int64_t pnum = 0; 4710 4711 for (offset = 0; offset < ssize; offset += pnum) { 4712 int ret; 4713 4714 ret = bdrv_block_status_above(in_bs, NULL, offset, 4715 ssize - offset, &pnum, NULL, 4716 NULL); 4717 if (ret < 0) { 4718 error_setg_errno(&local_err, -ret, 4719 "Unable to get block status"); 4720 goto err; 4721 } 4722 4723 if (ret & BDRV_BLOCK_ZERO) { 4724 /* Skip zero regions (safe with no backing file) */ 4725 } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) == 4726 (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) { 4727 /* Extend pnum to end of cluster for next iteration */ 4728 pnum = ROUND_UP(offset + pnum, cluster_size) - offset; 4729 4730 /* Count clusters we've seen */ 4731 required += offset % cluster_size + pnum; 4732 } 4733 } 4734 } 4735 } 4736 4737 /* Take into account preallocation. Nothing special is needed for 4738 * PREALLOC_MODE_METADATA since metadata is always counted. 4739 */ 4740 if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { 4741 required = virtual_size; 4742 } 4743 4744 info = g_new(BlockMeasureInfo, 1); 4745 info->fully_allocated = 4746 qcow2_calc_prealloc_size(virtual_size, cluster_size, 4747 ctz32(refcount_bits)) + luks_payload_size; 4748 4749 /* Remove data clusters that are not required. This overestimates the 4750 * required size because metadata needed for the fully allocated file is 4751 * still counted. 
4752 */ 4753 info->required = info->fully_allocated - virtual_size + required; 4754 return info; 4755 4756 err: 4757 error_propagate(errp, local_err); 4758 return NULL; 4759 } 4760 4761 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 4762 { 4763 BDRVQcow2State *s = bs->opaque; 4764 bdi->unallocated_blocks_are_zero = true; 4765 bdi->cluster_size = s->cluster_size; 4766 bdi->vm_state_offset = qcow2_vm_state_offset(s); 4767 return 0; 4768 } 4769 4770 static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs, 4771 Error **errp) 4772 { 4773 BDRVQcow2State *s = bs->opaque; 4774 ImageInfoSpecific *spec_info; 4775 QCryptoBlockInfo *encrypt_info = NULL; 4776 Error *local_err = NULL; 4777 4778 if (s->crypto != NULL) { 4779 encrypt_info = qcrypto_block_get_info(s->crypto, &local_err); 4780 if (local_err) { 4781 error_propagate(errp, local_err); 4782 return NULL; 4783 } 4784 } 4785 4786 spec_info = g_new(ImageInfoSpecific, 1); 4787 *spec_info = (ImageInfoSpecific){ 4788 .type = IMAGE_INFO_SPECIFIC_KIND_QCOW2, 4789 .u.qcow2.data = g_new0(ImageInfoSpecificQCow2, 1), 4790 }; 4791 if (s->qcow_version == 2) { 4792 *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){ 4793 .compat = g_strdup("0.10"), 4794 .refcount_bits = s->refcount_bits, 4795 }; 4796 } else if (s->qcow_version == 3) { 4797 Qcow2BitmapInfoList *bitmaps; 4798 bitmaps = qcow2_get_bitmap_info_list(bs, &local_err); 4799 if (local_err) { 4800 error_propagate(errp, local_err); 4801 qapi_free_ImageInfoSpecific(spec_info); 4802 return NULL; 4803 } 4804 *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){ 4805 .compat = g_strdup("1.1"), 4806 .lazy_refcounts = s->compatible_features & 4807 QCOW2_COMPAT_LAZY_REFCOUNTS, 4808 .has_lazy_refcounts = true, 4809 .corrupt = s->incompatible_features & 4810 QCOW2_INCOMPAT_CORRUPT, 4811 .has_corrupt = true, 4812 .refcount_bits = s->refcount_bits, 4813 .has_bitmaps = !!bitmaps, 4814 .bitmaps = bitmaps, 4815 .has_data_file = !!s->image_data_file, 4816 
.data_file = g_strdup(s->image_data_file), 4817 .has_data_file_raw = has_data_file(bs), 4818 .data_file_raw = data_file_is_raw(bs), 4819 }; 4820 } else { 4821 /* if this assertion fails, this probably means a new version was 4822 * added without having it covered here */ 4823 assert(false); 4824 } 4825 4826 if (encrypt_info) { 4827 ImageInfoSpecificQCow2Encryption *qencrypt = 4828 g_new(ImageInfoSpecificQCow2Encryption, 1); 4829 switch (encrypt_info->format) { 4830 case Q_CRYPTO_BLOCK_FORMAT_QCOW: 4831 qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES; 4832 break; 4833 case Q_CRYPTO_BLOCK_FORMAT_LUKS: 4834 qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS; 4835 qencrypt->u.luks = encrypt_info->u.luks; 4836 break; 4837 default: 4838 abort(); 4839 } 4840 /* Since we did shallow copy above, erase any pointers 4841 * in the original info */ 4842 memset(&encrypt_info->u, 0, sizeof(encrypt_info->u)); 4843 qapi_free_QCryptoBlockInfo(encrypt_info); 4844 4845 spec_info->u.qcow2.data->has_encrypt = true; 4846 spec_info->u.qcow2.data->encrypt = qencrypt; 4847 } 4848 4849 return spec_info; 4850 } 4851 4852 static int qcow2_has_zero_init(BlockDriverState *bs) 4853 { 4854 BDRVQcow2State *s = bs->opaque; 4855 bool preallocated; 4856 4857 if (qemu_in_coroutine()) { 4858 qemu_co_mutex_lock(&s->lock); 4859 } 4860 /* 4861 * Check preallocation status: Preallocated images have all L2 4862 * tables allocated, nonpreallocated images have none. It is 4863 * therefore enough to check the first one. 
4864 */ 4865 preallocated = s->l1_size > 0 && s->l1_table[0] != 0; 4866 if (qemu_in_coroutine()) { 4867 qemu_co_mutex_unlock(&s->lock); 4868 } 4869 4870 if (!preallocated) { 4871 return 1; 4872 } else if (bs->encrypted) { 4873 return 0; 4874 } else { 4875 return bdrv_has_zero_init(s->data_file->bs); 4876 } 4877 } 4878 4879 static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, 4880 int64_t pos) 4881 { 4882 BDRVQcow2State *s = bs->opaque; 4883 4884 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); 4885 return bs->drv->bdrv_co_pwritev_part(bs, qcow2_vm_state_offset(s) + pos, 4886 qiov->size, qiov, 0, 0); 4887 } 4888 4889 static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, 4890 int64_t pos) 4891 { 4892 BDRVQcow2State *s = bs->opaque; 4893 4894 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); 4895 return bs->drv->bdrv_co_preadv_part(bs, qcow2_vm_state_offset(s) + pos, 4896 qiov->size, qiov, 0, 0); 4897 } 4898 4899 /* 4900 * Downgrades an image's version. To achieve this, any incompatible features 4901 * have to be removed. 
4902 */ 4903 static int qcow2_downgrade(BlockDriverState *bs, int target_version, 4904 BlockDriverAmendStatusCB *status_cb, void *cb_opaque, 4905 Error **errp) 4906 { 4907 BDRVQcow2State *s = bs->opaque; 4908 int current_version = s->qcow_version; 4909 int ret; 4910 4911 /* This is qcow2_downgrade(), not qcow2_upgrade() */ 4912 assert(target_version < current_version); 4913 4914 /* There are no other versions (now) that you can downgrade to */ 4915 assert(target_version == 2); 4916 4917 if (s->refcount_order != 4) { 4918 error_setg(errp, "compat=0.10 requires refcount_bits=16"); 4919 return -ENOTSUP; 4920 } 4921 4922 if (has_data_file(bs)) { 4923 error_setg(errp, "Cannot downgrade an image with a data file"); 4924 return -ENOTSUP; 4925 } 4926 4927 /* clear incompatible features */ 4928 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { 4929 ret = qcow2_mark_clean(bs); 4930 if (ret < 0) { 4931 error_setg_errno(errp, -ret, "Failed to make the image clean"); 4932 return ret; 4933 } 4934 } 4935 4936 /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in 4937 * the first place; if that happens nonetheless, returning -ENOTSUP is the 4938 * best thing to do anyway */ 4939 4940 if (s->incompatible_features) { 4941 error_setg(errp, "Cannot downgrade an image with incompatible features " 4942 "%#" PRIx64 " set", s->incompatible_features); 4943 return -ENOTSUP; 4944 } 4945 4946 /* since we can ignore compatible features, we can set them to 0 as well */ 4947 s->compatible_features = 0; 4948 /* if lazy refcounts have been used, they have already been fixed through 4949 * clearing the dirty flag */ 4950 4951 /* clearing autoclear features is trivial */ 4952 s->autoclear_features = 0; 4953 4954 ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque); 4955 if (ret < 0) { 4956 error_setg_errno(errp, -ret, "Failed to turn zero into data clusters"); 4957 return ret; 4958 } 4959 4960 s->qcow_version = target_version; 4961 ret = qcow2_update_header(bs); 4962 
if (ret < 0) { 4963 s->qcow_version = current_version; 4964 error_setg_errno(errp, -ret, "Failed to update the image header"); 4965 return ret; 4966 } 4967 return 0; 4968 } 4969 4970 /* 4971 * Upgrades an image's version. While newer versions encompass all 4972 * features of older versions, some things may have to be presented 4973 * differently. 4974 */ 4975 static int qcow2_upgrade(BlockDriverState *bs, int target_version, 4976 BlockDriverAmendStatusCB *status_cb, void *cb_opaque, 4977 Error **errp) 4978 { 4979 BDRVQcow2State *s = bs->opaque; 4980 bool need_snapshot_update; 4981 int current_version = s->qcow_version; 4982 int i; 4983 int ret; 4984 4985 /* This is qcow2_upgrade(), not qcow2_downgrade() */ 4986 assert(target_version > current_version); 4987 4988 /* There are no other versions (yet) that you can upgrade to */ 4989 assert(target_version == 3); 4990 4991 status_cb(bs, 0, 2, cb_opaque); 4992 4993 /* 4994 * In v2, snapshots do not need to have extra data. v3 requires 4995 * the 64-bit VM state size and the virtual disk size to be 4996 * present. 4997 * qcow2_write_snapshots() will always write the list in the 4998 * v3-compliant format. 
4999 */ 5000 need_snapshot_update = false; 5001 for (i = 0; i < s->nb_snapshots; i++) { 5002 if (s->snapshots[i].extra_data_size < 5003 sizeof_field(QCowSnapshotExtraData, vm_state_size_large) + 5004 sizeof_field(QCowSnapshotExtraData, disk_size)) 5005 { 5006 need_snapshot_update = true; 5007 break; 5008 } 5009 } 5010 if (need_snapshot_update) { 5011 ret = qcow2_write_snapshots(bs); 5012 if (ret < 0) { 5013 error_setg_errno(errp, -ret, "Failed to update the snapshot table"); 5014 return ret; 5015 } 5016 } 5017 status_cb(bs, 1, 2, cb_opaque); 5018 5019 s->qcow_version = target_version; 5020 ret = qcow2_update_header(bs); 5021 if (ret < 0) { 5022 s->qcow_version = current_version; 5023 error_setg_errno(errp, -ret, "Failed to update the image header"); 5024 return ret; 5025 } 5026 status_cb(bs, 2, 2, cb_opaque); 5027 5028 return 0; 5029 } 5030 5031 typedef enum Qcow2AmendOperation { 5032 /* This is the value Qcow2AmendHelperCBInfo::last_operation will be 5033 * statically initialized to so that the helper CB can discern the first 5034 * invocation from an operation change */ 5035 QCOW2_NO_OPERATION = 0, 5036 5037 QCOW2_UPGRADING, 5038 QCOW2_CHANGING_REFCOUNT_ORDER, 5039 QCOW2_DOWNGRADING, 5040 } Qcow2AmendOperation; 5041 5042 typedef struct Qcow2AmendHelperCBInfo { 5043 /* The code coordinating the amend operations should only modify 5044 * these four fields; the rest will be managed by the CB */ 5045 BlockDriverAmendStatusCB *original_status_cb; 5046 void *original_cb_opaque; 5047 5048 Qcow2AmendOperation current_operation; 5049 5050 /* Total number of operations to perform (only set once) */ 5051 int total_operations; 5052 5053 /* The following fields are managed by the CB */ 5054 5055 /* Number of operations completed */ 5056 int operations_completed; 5057 5058 /* Cumulative offset of all completed operations */ 5059 int64_t offset_completed; 5060 5061 Qcow2AmendOperation last_operation; 5062 int64_t last_work_size; 5063 } Qcow2AmendHelperCBInfo; 5064 5065 static 
void qcow2_amend_helper_cb(BlockDriverState *bs, 5066 int64_t operation_offset, 5067 int64_t operation_work_size, void *opaque) 5068 { 5069 Qcow2AmendHelperCBInfo *info = opaque; 5070 int64_t current_work_size; 5071 int64_t projected_work_size; 5072 5073 if (info->current_operation != info->last_operation) { 5074 if (info->last_operation != QCOW2_NO_OPERATION) { 5075 info->offset_completed += info->last_work_size; 5076 info->operations_completed++; 5077 } 5078 5079 info->last_operation = info->current_operation; 5080 } 5081 5082 assert(info->total_operations > 0); 5083 assert(info->operations_completed < info->total_operations); 5084 5085 info->last_work_size = operation_work_size; 5086 5087 current_work_size = info->offset_completed + operation_work_size; 5088 5089 /* current_work_size is the total work size for (operations_completed + 1) 5090 * operations (which includes this one), so multiply it by the number of 5091 * operations not covered and divide it by the number of operations 5092 * covered to get a projection for the operations not covered */ 5093 projected_work_size = current_work_size * (info->total_operations - 5094 info->operations_completed - 1) 5095 / (info->operations_completed + 1); 5096 5097 info->original_status_cb(bs, info->offset_completed + operation_offset, 5098 current_work_size + projected_work_size, 5099 info->original_cb_opaque); 5100 } 5101 5102 static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, 5103 BlockDriverAmendStatusCB *status_cb, 5104 void *cb_opaque, 5105 Error **errp) 5106 { 5107 BDRVQcow2State *s = bs->opaque; 5108 int old_version = s->qcow_version, new_version = old_version; 5109 uint64_t new_size = 0; 5110 const char *backing_file = NULL, *backing_format = NULL, *data_file = NULL; 5111 bool lazy_refcounts = s->use_lazy_refcounts; 5112 bool data_file_raw = data_file_is_raw(bs); 5113 const char *compat = NULL; 5114 uint64_t cluster_size = s->cluster_size; 5115 bool encrypt; 5116 int encformat; 5117 int 
refcount_bits = s->refcount_bits; 5118 int ret; 5119 QemuOptDesc *desc = opts->list->desc; 5120 Qcow2AmendHelperCBInfo helper_cb_info; 5121 5122 while (desc && desc->name) { 5123 if (!qemu_opt_find(opts, desc->name)) { 5124 /* only change explicitly defined options */ 5125 desc++; 5126 continue; 5127 } 5128 5129 if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) { 5130 compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL); 5131 if (!compat) { 5132 /* preserve default */ 5133 } else if (!strcmp(compat, "0.10") || !strcmp(compat, "v2")) { 5134 new_version = 2; 5135 } else if (!strcmp(compat, "1.1") || !strcmp(compat, "v3")) { 5136 new_version = 3; 5137 } else { 5138 error_setg(errp, "Unknown compatibility level %s", compat); 5139 return -EINVAL; 5140 } 5141 } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) { 5142 error_setg(errp, "Cannot change preallocation mode"); 5143 return -ENOTSUP; 5144 } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) { 5145 new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); 5146 } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) { 5147 backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE); 5148 } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) { 5149 backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); 5150 } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) { 5151 encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT, 5152 !!s->crypto); 5153 5154 if (encrypt != !!s->crypto) { 5155 error_setg(errp, 5156 "Changing the encryption flag is not supported"); 5157 return -ENOTSUP; 5158 } 5159 } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT_FORMAT)) { 5160 encformat = qcow2_crypt_method_from_format( 5161 qemu_opt_get(opts, BLOCK_OPT_ENCRYPT_FORMAT)); 5162 5163 if (encformat != s->crypt_method_header) { 5164 error_setg(errp, 5165 "Changing the encryption format is not supported"); 5166 return -ENOTSUP; 5167 } 5168 } else if (g_str_has_prefix(desc->name, "encrypt.")) { 5169 error_setg(errp, 5170 "Changing the encryption parameters is 
not supported"); 5171 return -ENOTSUP; 5172 } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) { 5173 cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, 5174 cluster_size); 5175 if (cluster_size != s->cluster_size) { 5176 error_setg(errp, "Changing the cluster size is not supported"); 5177 return -ENOTSUP; 5178 } 5179 } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) { 5180 lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS, 5181 lazy_refcounts); 5182 } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) { 5183 refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS, 5184 refcount_bits); 5185 5186 if (refcount_bits <= 0 || refcount_bits > 64 || 5187 !is_power_of_2(refcount_bits)) 5188 { 5189 error_setg(errp, "Refcount width must be a power of two and " 5190 "may not exceed 64 bits"); 5191 return -EINVAL; 5192 } 5193 } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE)) { 5194 data_file = qemu_opt_get(opts, BLOCK_OPT_DATA_FILE); 5195 if (data_file && !has_data_file(bs)) { 5196 error_setg(errp, "data-file can only be set for images that " 5197 "use an external data file"); 5198 return -EINVAL; 5199 } 5200 } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE_RAW)) { 5201 data_file_raw = qemu_opt_get_bool(opts, BLOCK_OPT_DATA_FILE_RAW, 5202 data_file_raw); 5203 if (data_file_raw && !data_file_is_raw(bs)) { 5204 error_setg(errp, "data-file-raw cannot be set on existing " 5205 "images"); 5206 return -EINVAL; 5207 } 5208 } else { 5209 /* if this point is reached, this probably means a new option was 5210 * added without having it covered here */ 5211 abort(); 5212 } 5213 5214 desc++; 5215 } 5216 5217 helper_cb_info = (Qcow2AmendHelperCBInfo){ 5218 .original_status_cb = status_cb, 5219 .original_cb_opaque = cb_opaque, 5220 .total_operations = (new_version != old_version) 5221 + (s->refcount_bits != refcount_bits) 5222 }; 5223 5224 /* Upgrade first (some features may require compat=1.1) */ 5225 if (new_version > old_version) 
{ 5226 helper_cb_info.current_operation = QCOW2_UPGRADING; 5227 ret = qcow2_upgrade(bs, new_version, &qcow2_amend_helper_cb, 5228 &helper_cb_info, errp); 5229 if (ret < 0) { 5230 return ret; 5231 } 5232 } 5233 5234 if (s->refcount_bits != refcount_bits) { 5235 int refcount_order = ctz32(refcount_bits); 5236 5237 if (new_version < 3 && refcount_bits != 16) { 5238 error_setg(errp, "Refcount widths other than 16 bits require " 5239 "compatibility level 1.1 or above (use compat=1.1 or " 5240 "greater)"); 5241 return -EINVAL; 5242 } 5243 5244 helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER; 5245 ret = qcow2_change_refcount_order(bs, refcount_order, 5246 &qcow2_amend_helper_cb, 5247 &helper_cb_info, errp); 5248 if (ret < 0) { 5249 return ret; 5250 } 5251 } 5252 5253 /* data-file-raw blocks backing files, so clear it first if requested */ 5254 if (data_file_raw) { 5255 s->autoclear_features |= QCOW2_AUTOCLEAR_DATA_FILE_RAW; 5256 } else { 5257 s->autoclear_features &= ~QCOW2_AUTOCLEAR_DATA_FILE_RAW; 5258 } 5259 5260 if (data_file) { 5261 g_free(s->image_data_file); 5262 s->image_data_file = *data_file ? 
g_strdup(data_file) : NULL; 5263 } 5264 5265 ret = qcow2_update_header(bs); 5266 if (ret < 0) { 5267 error_setg_errno(errp, -ret, "Failed to update the image header"); 5268 return ret; 5269 } 5270 5271 if (backing_file || backing_format) { 5272 ret = qcow2_change_backing_file(bs, 5273 backing_file ?: s->image_backing_file, 5274 backing_format ?: s->image_backing_format); 5275 if (ret < 0) { 5276 error_setg_errno(errp, -ret, "Failed to change the backing file"); 5277 return ret; 5278 } 5279 } 5280 5281 if (s->use_lazy_refcounts != lazy_refcounts) { 5282 if (lazy_refcounts) { 5283 if (new_version < 3) { 5284 error_setg(errp, "Lazy refcounts only supported with " 5285 "compatibility level 1.1 and above (use compat=1.1 " 5286 "or greater)"); 5287 return -EINVAL; 5288 } 5289 s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; 5290 ret = qcow2_update_header(bs); 5291 if (ret < 0) { 5292 s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; 5293 error_setg_errno(errp, -ret, "Failed to update the image header"); 5294 return ret; 5295 } 5296 s->use_lazy_refcounts = true; 5297 } else { 5298 /* make image clean first */ 5299 ret = qcow2_mark_clean(bs); 5300 if (ret < 0) { 5301 error_setg_errno(errp, -ret, "Failed to make the image clean"); 5302 return ret; 5303 } 5304 /* now disallow lazy refcounts */ 5305 s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; 5306 ret = qcow2_update_header(bs); 5307 if (ret < 0) { 5308 s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; 5309 error_setg_errno(errp, -ret, "Failed to update the image header"); 5310 return ret; 5311 } 5312 s->use_lazy_refcounts = false; 5313 } 5314 } 5315 5316 if (new_size) { 5317 BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), 5318 BLK_PERM_RESIZE, BLK_PERM_ALL); 5319 ret = blk_insert_bs(blk, bs, errp); 5320 if (ret < 0) { 5321 blk_unref(blk); 5322 return ret; 5323 } 5324 5325 /* 5326 * Amending image options should ensure that the image has 5327 * exactly the given new values, so pass exact=true 
here. 5328 */ 5329 ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp); 5330 blk_unref(blk); 5331 if (ret < 0) { 5332 return ret; 5333 } 5334 } 5335 5336 /* Downgrade last (so unsupported features can be removed before) */ 5337 if (new_version < old_version) { 5338 helper_cb_info.current_operation = QCOW2_DOWNGRADING; 5339 ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb, 5340 &helper_cb_info, errp); 5341 if (ret < 0) { 5342 return ret; 5343 } 5344 } 5345 5346 return 0; 5347 } 5348 5349 /* 5350 * If offset or size are negative, respectively, they will not be included in 5351 * the BLOCK_IMAGE_CORRUPTED event emitted. 5352 * fatal will be ignored for read-only BDS; corruptions found there will always 5353 * be considered non-fatal. 5354 */ 5355 void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, 5356 int64_t size, const char *message_format, ...) 5357 { 5358 BDRVQcow2State *s = bs->opaque; 5359 const char *node_name; 5360 char *message; 5361 va_list ap; 5362 5363 fatal = fatal && bdrv_is_writable(bs); 5364 5365 if (s->signaled_corruption && 5366 (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT))) 5367 { 5368 return; 5369 } 5370 5371 va_start(ap, message_format); 5372 message = g_strdup_vprintf(message_format, ap); 5373 va_end(ap); 5374 5375 if (fatal) { 5376 fprintf(stderr, "qcow2: Marking image as corrupt: %s; further " 5377 "corruption events will be suppressed\n", message); 5378 } else { 5379 fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal " 5380 "corruption events will be suppressed\n", message); 5381 } 5382 5383 node_name = bdrv_get_node_name(bs); 5384 qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), 5385 *node_name != '\0', node_name, 5386 message, offset >= 0, offset, 5387 size >= 0, size, 5388 fatal); 5389 g_free(message); 5390 5391 if (fatal) { 5392 qcow2_mark_corrupt(bs); 5393 bs->drv = NULL; /* make BDS unusable */ 5394 } 5395 5396 s->signaled_corruption = 
true; 5397 } 5398 5399 static QemuOptsList qcow2_create_opts = { 5400 .name = "qcow2-create-opts", 5401 .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head), 5402 .desc = { 5403 { 5404 .name = BLOCK_OPT_SIZE, 5405 .type = QEMU_OPT_SIZE, 5406 .help = "Virtual disk size" 5407 }, 5408 { 5409 .name = BLOCK_OPT_COMPAT_LEVEL, 5410 .type = QEMU_OPT_STRING, 5411 .help = "Compatibility level (v2 [0.10] or v3 [1.1])" 5412 }, 5413 { 5414 .name = BLOCK_OPT_BACKING_FILE, 5415 .type = QEMU_OPT_STRING, 5416 .help = "File name of a base image" 5417 }, 5418 { 5419 .name = BLOCK_OPT_BACKING_FMT, 5420 .type = QEMU_OPT_STRING, 5421 .help = "Image format of the base image" 5422 }, 5423 { 5424 .name = BLOCK_OPT_DATA_FILE, 5425 .type = QEMU_OPT_STRING, 5426 .help = "File name of an external data file" 5427 }, 5428 { 5429 .name = BLOCK_OPT_DATA_FILE_RAW, 5430 .type = QEMU_OPT_BOOL, 5431 .help = "The external data file must stay valid as a raw image" 5432 }, 5433 { 5434 .name = BLOCK_OPT_ENCRYPT, 5435 .type = QEMU_OPT_BOOL, 5436 .help = "Encrypt the image with format 'aes'. 
(Deprecated " 5437 "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)", 5438 }, 5439 { 5440 .name = BLOCK_OPT_ENCRYPT_FORMAT, 5441 .type = QEMU_OPT_STRING, 5442 .help = "Encrypt the image, format choices: 'aes', 'luks'", 5443 }, 5444 BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.", 5445 "ID of secret providing qcow AES key or LUKS passphrase"), 5446 BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."), 5447 BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."), 5448 BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."), 5449 BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."), 5450 BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."), 5451 BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."), 5452 { 5453 .name = BLOCK_OPT_CLUSTER_SIZE, 5454 .type = QEMU_OPT_SIZE, 5455 .help = "qcow2 cluster size", 5456 .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) 5457 }, 5458 { 5459 .name = BLOCK_OPT_PREALLOC, 5460 .type = QEMU_OPT_STRING, 5461 .help = "Preallocation mode (allowed values: off, metadata, " 5462 "falloc, full)" 5463 }, 5464 { 5465 .name = BLOCK_OPT_LAZY_REFCOUNTS, 5466 .type = QEMU_OPT_BOOL, 5467 .help = "Postpone refcount updates", 5468 .def_value_str = "off" 5469 }, 5470 { 5471 .name = BLOCK_OPT_REFCOUNT_BITS, 5472 .type = QEMU_OPT_NUMBER, 5473 .help = "Width of a reference count entry in bits", 5474 .def_value_str = "16" 5475 }, 5476 { /* end of list */ } 5477 } 5478 }; 5479 5480 static const char *const qcow2_strong_runtime_opts[] = { 5481 "encrypt." 
BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET, 5482 5483 NULL 5484 }; 5485 5486 BlockDriver bdrv_qcow2 = { 5487 .format_name = "qcow2", 5488 .instance_size = sizeof(BDRVQcow2State), 5489 .bdrv_probe = qcow2_probe, 5490 .bdrv_open = qcow2_open, 5491 .bdrv_close = qcow2_close, 5492 .bdrv_reopen_prepare = qcow2_reopen_prepare, 5493 .bdrv_reopen_commit = qcow2_reopen_commit, 5494 .bdrv_reopen_abort = qcow2_reopen_abort, 5495 .bdrv_join_options = qcow2_join_options, 5496 .bdrv_child_perm = bdrv_format_default_perms, 5497 .bdrv_co_create_opts = qcow2_co_create_opts, 5498 .bdrv_co_create = qcow2_co_create, 5499 .bdrv_has_zero_init = qcow2_has_zero_init, 5500 .bdrv_has_zero_init_truncate = bdrv_has_zero_init_1, 5501 .bdrv_co_block_status = qcow2_co_block_status, 5502 5503 .bdrv_co_preadv_part = qcow2_co_preadv_part, 5504 .bdrv_co_pwritev_part = qcow2_co_pwritev_part, 5505 .bdrv_co_flush_to_os = qcow2_co_flush_to_os, 5506 5507 .bdrv_co_pwrite_zeroes = qcow2_co_pwrite_zeroes, 5508 .bdrv_co_pdiscard = qcow2_co_pdiscard, 5509 .bdrv_co_copy_range_from = qcow2_co_copy_range_from, 5510 .bdrv_co_copy_range_to = qcow2_co_copy_range_to, 5511 .bdrv_co_truncate = qcow2_co_truncate, 5512 .bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part, 5513 .bdrv_make_empty = qcow2_make_empty, 5514 5515 .bdrv_snapshot_create = qcow2_snapshot_create, 5516 .bdrv_snapshot_goto = qcow2_snapshot_goto, 5517 .bdrv_snapshot_delete = qcow2_snapshot_delete, 5518 .bdrv_snapshot_list = qcow2_snapshot_list, 5519 .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, 5520 .bdrv_measure = qcow2_measure, 5521 .bdrv_get_info = qcow2_get_info, 5522 .bdrv_get_specific_info = qcow2_get_specific_info, 5523 5524 .bdrv_save_vmstate = qcow2_save_vmstate, 5525 .bdrv_load_vmstate = qcow2_load_vmstate, 5526 5527 .supports_backing = true, 5528 .bdrv_change_backing_file = qcow2_change_backing_file, 5529 5530 .bdrv_refresh_limits = qcow2_refresh_limits, 5531 .bdrv_co_invalidate_cache = qcow2_co_invalidate_cache, 5532 
.bdrv_inactivate = qcow2_inactivate, 5533 5534 .create_opts = &qcow2_create_opts, 5535 .strong_runtime_opts = qcow2_strong_runtime_opts, 5536 .mutable_opts = mutable_opts, 5537 .bdrv_co_check = qcow2_co_check, 5538 .bdrv_amend_options = qcow2_amend_options, 5539 5540 .bdrv_detach_aio_context = qcow2_detach_aio_context, 5541 .bdrv_attach_aio_context = qcow2_attach_aio_context, 5542 5543 .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap, 5544 .bdrv_co_remove_persistent_dirty_bitmap = 5545 qcow2_co_remove_persistent_dirty_bitmap, 5546 }; 5547 5548 static void bdrv_qcow2_init(void) 5549 { 5550 bdrv_register(&bdrv_qcow2); 5551 } 5552 5553 block_init(bdrv_qcow2_init); 5554