1 /* 2 * Block driver for the QCOW version 2 format 3 * 4 * Copyright (c) 2004-2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include "qemu-common.h" 25 #include "block/block_int.h" 26 #include "qemu/module.h" 27 #include <zlib.h> 28 #include "qemu/aes.h" 29 #include "block/qcow2.h" 30 #include "qemu/error-report.h" 31 #include "qapi/qmp/qerror.h" 32 #include "qapi/qmp/qbool.h" 33 #include "trace.h" 34 35 /* 36 Differences with QCOW: 37 38 - Support for multiple incremental snapshots. 39 - Memory management by reference counts. 40 - Clusters which have a reference count of one have the bit 41 QCOW_OFLAG_COPIED to optimize write performance. 42 - Size of compressed clusters is stored in sectors to reduce bit usage 43 in the cluster offsets. 44 - Support for storing additional data (such as the VM state) in the 45 snapshots. 
46 - If a backing store is used, the cluster size is not constrained 47 (could be backported to QCOW). 48 - L2 tables have always a size of one cluster. 49 */ 50 51 52 typedef struct { 53 uint32_t magic; 54 uint32_t len; 55 } QCowExtension; 56 57 #define QCOW2_EXT_MAGIC_END 0 58 #define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA 59 #define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 60 61 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) 62 { 63 const QCowHeader *cow_header = (const void *)buf; 64 65 if (buf_size >= sizeof(QCowHeader) && 66 be32_to_cpu(cow_header->magic) == QCOW_MAGIC && 67 be32_to_cpu(cow_header->version) >= 2) 68 return 100; 69 else 70 return 0; 71 } 72 73 74 /* 75 * read qcow2 extension and fill bs 76 * start reading from start_offset 77 * finish reading upon magic of value 0 or when end_offset reached 78 * unknown magic is skipped (future extension this version knows nothing about) 79 * return 0 upon success, non-0 otherwise 80 */ 81 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, 82 uint64_t end_offset, void **p_feature_table) 83 { 84 BDRVQcowState *s = bs->opaque; 85 QCowExtension ext; 86 uint64_t offset; 87 int ret; 88 89 #ifdef DEBUG_EXT 90 printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); 91 #endif 92 offset = start_offset; 93 while (offset < end_offset) { 94 95 #ifdef DEBUG_EXT 96 /* Sanity check */ 97 if (offset > s->cluster_size) 98 printf("qcow2_read_extension: suspicious offset %lu\n", offset); 99 100 printf("attempting to read extended header in offset %lu\n", offset); 101 #endif 102 103 if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { 104 fprintf(stderr, "qcow2_read_extension: ERROR: " 105 "pread fail from offset %" PRIu64 "\n", 106 offset); 107 return 1; 108 } 109 be32_to_cpus(&ext.magic); 110 be32_to_cpus(&ext.len); 111 offset += sizeof(ext); 112 #ifdef DEBUG_EXT 113 printf("ext.magic = 0x%x\n", ext.magic); 114 #endif 115 if 
(ext.len > end_offset - offset) { 116 error_report("Header extension too large"); 117 return -EINVAL; 118 } 119 120 switch (ext.magic) { 121 case QCOW2_EXT_MAGIC_END: 122 return 0; 123 124 case QCOW2_EXT_MAGIC_BACKING_FORMAT: 125 if (ext.len >= sizeof(bs->backing_format)) { 126 fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" 127 " (>=%zu)\n", 128 ext.len, sizeof(bs->backing_format)); 129 return 2; 130 } 131 if (bdrv_pread(bs->file, offset , bs->backing_format, 132 ext.len) != ext.len) 133 return 3; 134 bs->backing_format[ext.len] = '\0'; 135 #ifdef DEBUG_EXT 136 printf("Qcow2: Got format extension %s\n", bs->backing_format); 137 #endif 138 break; 139 140 case QCOW2_EXT_MAGIC_FEATURE_TABLE: 141 if (p_feature_table != NULL) { 142 void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); 143 ret = bdrv_pread(bs->file, offset , feature_table, ext.len); 144 if (ret < 0) { 145 return ret; 146 } 147 148 *p_feature_table = feature_table; 149 } 150 break; 151 152 default: 153 /* unknown magic - save it in case we need to rewrite the header */ 154 { 155 Qcow2UnknownHeaderExtension *uext; 156 157 uext = g_malloc0(sizeof(*uext) + ext.len); 158 uext->magic = ext.magic; 159 uext->len = ext.len; 160 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); 161 162 ret = bdrv_pread(bs->file, offset , uext->data, uext->len); 163 if (ret < 0) { 164 return ret; 165 } 166 } 167 break; 168 } 169 170 offset += ((ext.len + 7) & ~7); 171 } 172 173 return 0; 174 } 175 176 static void cleanup_unknown_header_ext(BlockDriverState *bs) 177 { 178 BDRVQcowState *s = bs->opaque; 179 Qcow2UnknownHeaderExtension *uext, *next; 180 181 QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { 182 QLIST_REMOVE(uext, next); 183 g_free(uext); 184 } 185 } 186 187 static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, 188 const char *fmt, ...) 
189 { 190 char msg[64]; 191 va_list ap; 192 193 va_start(ap, fmt); 194 vsnprintf(msg, sizeof(msg), fmt, ap); 195 va_end(ap); 196 197 qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, 198 bs->device_name, "qcow2", msg); 199 } 200 201 static void report_unsupported_feature(BlockDriverState *bs, 202 Qcow2Feature *table, uint64_t mask) 203 { 204 while (table && table->name[0] != '\0') { 205 if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { 206 if (mask & (1 << table->bit)) { 207 report_unsupported(bs, "%.46s",table->name); 208 mask &= ~(1 << table->bit); 209 } 210 } 211 table++; 212 } 213 214 if (mask) { 215 report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); 216 } 217 } 218 219 /* 220 * Sets the dirty bit and flushes afterwards if necessary. 221 * 222 * The incompatible_features bit is only set if the image file header was 223 * updated successfully. Therefore it is not required to check the return 224 * value of this function. 225 */ 226 int qcow2_mark_dirty(BlockDriverState *bs) 227 { 228 BDRVQcowState *s = bs->opaque; 229 uint64_t val; 230 int ret; 231 232 assert(s->qcow_version >= 3); 233 234 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { 235 return 0; /* already dirty */ 236 } 237 238 val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); 239 ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), 240 &val, sizeof(val)); 241 if (ret < 0) { 242 return ret; 243 } 244 ret = bdrv_flush(bs->file); 245 if (ret < 0) { 246 return ret; 247 } 248 249 /* Only treat image as dirty if the header was updated successfully */ 250 s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; 251 return 0; 252 } 253 254 /* 255 * Clears the dirty bit and flushes before if necessary. Only call this 256 * function when there are no pending requests, it does not guard against 257 * concurrent requests dirtying the image. 
258 */ 259 static int qcow2_mark_clean(BlockDriverState *bs) 260 { 261 BDRVQcowState *s = bs->opaque; 262 263 if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { 264 int ret = bdrv_flush(bs); 265 if (ret < 0) { 266 return ret; 267 } 268 269 s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; 270 return qcow2_update_header(bs); 271 } 272 return 0; 273 } 274 275 /* 276 * Marks the image as corrupt. 277 */ 278 int qcow2_mark_corrupt(BlockDriverState *bs) 279 { 280 BDRVQcowState *s = bs->opaque; 281 282 s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; 283 return qcow2_update_header(bs); 284 } 285 286 /* 287 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes 288 * before if necessary. 289 */ 290 int qcow2_mark_consistent(BlockDriverState *bs) 291 { 292 BDRVQcowState *s = bs->opaque; 293 294 if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { 295 int ret = bdrv_flush(bs); 296 if (ret < 0) { 297 return ret; 298 } 299 300 s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; 301 return qcow2_update_header(bs); 302 } 303 return 0; 304 } 305 306 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, 307 BdrvCheckMode fix) 308 { 309 int ret = qcow2_check_refcounts(bs, result, fix); 310 if (ret < 0) { 311 return ret; 312 } 313 314 if (fix && result->check_errors == 0 && result->corruptions == 0) { 315 ret = qcow2_mark_clean(bs); 316 if (ret < 0) { 317 return ret; 318 } 319 return qcow2_mark_consistent(bs); 320 } 321 return ret; 322 } 323 324 static QemuOptsList qcow2_runtime_opts = { 325 .name = "qcow2", 326 .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), 327 .desc = { 328 { 329 .name = QCOW2_OPT_LAZY_REFCOUNTS, 330 .type = QEMU_OPT_BOOL, 331 .help = "Postpone refcount updates", 332 }, 333 { 334 .name = QCOW2_OPT_DISCARD_REQUEST, 335 .type = QEMU_OPT_BOOL, 336 .help = "Pass guest discard requests to the layer below", 337 }, 338 { 339 .name = QCOW2_OPT_DISCARD_SNAPSHOT, 340 .type = QEMU_OPT_BOOL, 341 .help = "Generate 
discard requests when snapshot related space " 342 "is freed", 343 }, 344 { 345 .name = QCOW2_OPT_DISCARD_OTHER, 346 .type = QEMU_OPT_BOOL, 347 .help = "Generate discard requests when other clusters are freed", 348 }, 349 { /* end of list */ } 350 }, 351 }; 352 353 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) 354 { 355 BDRVQcowState *s = bs->opaque; 356 int len, i, ret = 0; 357 QCowHeader header; 358 QemuOpts *opts; 359 Error *local_err = NULL; 360 uint64_t ext_end; 361 uint64_t l1_vm_state_index; 362 363 ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); 364 if (ret < 0) { 365 goto fail; 366 } 367 be32_to_cpus(&header.magic); 368 be32_to_cpus(&header.version); 369 be64_to_cpus(&header.backing_file_offset); 370 be32_to_cpus(&header.backing_file_size); 371 be64_to_cpus(&header.size); 372 be32_to_cpus(&header.cluster_bits); 373 be32_to_cpus(&header.crypt_method); 374 be64_to_cpus(&header.l1_table_offset); 375 be32_to_cpus(&header.l1_size); 376 be64_to_cpus(&header.refcount_table_offset); 377 be32_to_cpus(&header.refcount_table_clusters); 378 be64_to_cpus(&header.snapshots_offset); 379 be32_to_cpus(&header.nb_snapshots); 380 381 if (header.magic != QCOW_MAGIC) { 382 ret = -EMEDIUMTYPE; 383 goto fail; 384 } 385 if (header.version < 2 || header.version > 3) { 386 report_unsupported(bs, "QCOW version %d", header.version); 387 ret = -ENOTSUP; 388 goto fail; 389 } 390 391 s->qcow_version = header.version; 392 393 /* Initialise version 3 header fields */ 394 if (header.version == 2) { 395 header.incompatible_features = 0; 396 header.compatible_features = 0; 397 header.autoclear_features = 0; 398 header.refcount_order = 4; 399 header.header_length = 72; 400 } else { 401 be64_to_cpus(&header.incompatible_features); 402 be64_to_cpus(&header.compatible_features); 403 be64_to_cpus(&header.autoclear_features); 404 be32_to_cpus(&header.refcount_order); 405 be32_to_cpus(&header.header_length); 406 } 407 408 if (header.header_length > sizeof(header)) { 
409 s->unknown_header_fields_size = header.header_length - sizeof(header); 410 s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); 411 ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, 412 s->unknown_header_fields_size); 413 if (ret < 0) { 414 goto fail; 415 } 416 } 417 418 if (header.backing_file_offset) { 419 ext_end = header.backing_file_offset; 420 } else { 421 ext_end = 1 << header.cluster_bits; 422 } 423 424 /* Handle feature bits */ 425 s->incompatible_features = header.incompatible_features; 426 s->compatible_features = header.compatible_features; 427 s->autoclear_features = header.autoclear_features; 428 429 if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { 430 void *feature_table = NULL; 431 qcow2_read_extensions(bs, header.header_length, ext_end, 432 &feature_table); 433 report_unsupported_feature(bs, feature_table, 434 s->incompatible_features & 435 ~QCOW2_INCOMPAT_MASK); 436 ret = -ENOTSUP; 437 goto fail; 438 } 439 440 if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { 441 /* Corrupt images may not be written to unless they are being repaired 442 */ 443 if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { 444 error_report("qcow2: Image is corrupt; cannot be opened " 445 "read/write."); 446 ret = -EACCES; 447 goto fail; 448 } 449 } 450 451 /* Check support for various header values */ 452 if (header.refcount_order != 4) { 453 report_unsupported(bs, "%d bit reference counts", 454 1 << header.refcount_order); 455 ret = -ENOTSUP; 456 goto fail; 457 } 458 459 if (header.cluster_bits < MIN_CLUSTER_BITS || 460 header.cluster_bits > MAX_CLUSTER_BITS) { 461 ret = -EINVAL; 462 goto fail; 463 } 464 if (header.crypt_method > QCOW_CRYPT_AES) { 465 ret = -EINVAL; 466 goto fail; 467 } 468 s->crypt_method_header = header.crypt_method; 469 if (s->crypt_method_header) { 470 bs->encrypted = 1; 471 } 472 s->cluster_bits = header.cluster_bits; 473 s->cluster_size = 1 << s->cluster_bits; 474 s->cluster_sectors = 1 << 
(s->cluster_bits - 9); 475 s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ 476 s->l2_size = 1 << s->l2_bits; 477 bs->total_sectors = header.size / 512; 478 s->csize_shift = (62 - (s->cluster_bits - 8)); 479 s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; 480 s->cluster_offset_mask = (1LL << s->csize_shift) - 1; 481 s->refcount_table_offset = header.refcount_table_offset; 482 s->refcount_table_size = 483 header.refcount_table_clusters << (s->cluster_bits - 3); 484 485 s->snapshots_offset = header.snapshots_offset; 486 s->nb_snapshots = header.nb_snapshots; 487 488 /* read the level 1 table */ 489 s->l1_size = header.l1_size; 490 491 l1_vm_state_index = size_to_l1(s, header.size); 492 if (l1_vm_state_index > INT_MAX) { 493 ret = -EFBIG; 494 goto fail; 495 } 496 s->l1_vm_state_index = l1_vm_state_index; 497 498 /* the L1 table must contain at least enough entries to put 499 header.size bytes */ 500 if (s->l1_size < s->l1_vm_state_index) { 501 ret = -EINVAL; 502 goto fail; 503 } 504 s->l1_table_offset = header.l1_table_offset; 505 if (s->l1_size > 0) { 506 s->l1_table = g_malloc0( 507 align_offset(s->l1_size * sizeof(uint64_t), 512)); 508 ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, 509 s->l1_size * sizeof(uint64_t)); 510 if (ret < 0) { 511 goto fail; 512 } 513 for(i = 0;i < s->l1_size; i++) { 514 be64_to_cpus(&s->l1_table[i]); 515 } 516 } 517 518 /* alloc L2 table/refcount block cache */ 519 s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); 520 s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); 521 522 s->cluster_cache = g_malloc(s->cluster_size); 523 /* one more sector for decompressed data alignment */ 524 s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size 525 + 512); 526 s->cluster_cache_offset = -1; 527 s->flags = flags; 528 529 ret = qcow2_refcount_init(bs); 530 if (ret != 0) { 531 goto fail; 532 } 533 534 QLIST_INIT(&s->cluster_allocs); 535 QTAILQ_INIT(&s->discards); 536 
537 /* read qcow2 extensions */ 538 if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { 539 ret = -EINVAL; 540 goto fail; 541 } 542 543 /* read the backing file name */ 544 if (header.backing_file_offset != 0) { 545 len = header.backing_file_size; 546 if (len > 1023) { 547 len = 1023; 548 } 549 ret = bdrv_pread(bs->file, header.backing_file_offset, 550 bs->backing_file, len); 551 if (ret < 0) { 552 goto fail; 553 } 554 bs->backing_file[len] = '\0'; 555 } 556 557 ret = qcow2_read_snapshots(bs); 558 if (ret < 0) { 559 goto fail; 560 } 561 562 /* Clear unknown autoclear feature bits */ 563 if (!bs->read_only && s->autoclear_features != 0) { 564 s->autoclear_features = 0; 565 ret = qcow2_update_header(bs); 566 if (ret < 0) { 567 goto fail; 568 } 569 } 570 571 /* Initialise locks */ 572 qemu_co_mutex_init(&s->lock); 573 574 /* Repair image if dirty */ 575 if (!(flags & BDRV_O_CHECK) && !bs->read_only && 576 (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { 577 BdrvCheckResult result = {0}; 578 579 ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); 580 if (ret < 0) { 581 goto fail; 582 } 583 } 584 585 /* Enable lazy_refcounts according to image and command line options */ 586 opts = qemu_opts_create_nofail(&qcow2_runtime_opts); 587 qemu_opts_absorb_qdict(opts, options, &local_err); 588 if (error_is_set(&local_err)) { 589 qerror_report_err(local_err); 590 error_free(local_err); 591 ret = -EINVAL; 592 goto fail; 593 } 594 595 s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, 596 (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); 597 598 s->discard_passthrough[QCOW2_DISCARD_NEVER] = false; 599 s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; 600 s->discard_passthrough[QCOW2_DISCARD_REQUEST] = 601 qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, 602 flags & BDRV_O_UNMAP); 603 s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = 604 qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); 605 
s->discard_passthrough[QCOW2_DISCARD_OTHER] = 606 qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); 607 608 qemu_opts_del(opts); 609 610 if (s->use_lazy_refcounts && s->qcow_version < 3) { 611 qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " 612 "a qcow2 image with at least qemu 1.1 compatibility level"); 613 ret = -EINVAL; 614 goto fail; 615 } 616 617 #ifdef DEBUG_ALLOC 618 { 619 BdrvCheckResult result = {0}; 620 qcow2_check_refcounts(bs, &result, 0); 621 } 622 #endif 623 return ret; 624 625 fail: 626 g_free(s->unknown_header_fields); 627 cleanup_unknown_header_ext(bs); 628 qcow2_free_snapshots(bs); 629 qcow2_refcount_close(bs); 630 g_free(s->l1_table); 631 /* else pre-write overlap checks in cache_destroy may crash */ 632 s->l1_table = NULL; 633 if (s->l2_table_cache) { 634 qcow2_cache_destroy(bs, s->l2_table_cache); 635 } 636 g_free(s->cluster_cache); 637 qemu_vfree(s->cluster_data); 638 return ret; 639 } 640 641 static int qcow2_set_key(BlockDriverState *bs, const char *key) 642 { 643 BDRVQcowState *s = bs->opaque; 644 uint8_t keybuf[16]; 645 int len, i; 646 647 memset(keybuf, 0, 16); 648 len = strlen(key); 649 if (len > 16) 650 len = 16; 651 /* XXX: we could compress the chars to 7 bits to increase 652 entropy */ 653 for(i = 0;i < len;i++) { 654 keybuf[i] = key[i]; 655 } 656 s->crypt_method = s->crypt_method_header; 657 658 if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) 659 return -1; 660 if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) 661 return -1; 662 #if 0 663 /* test */ 664 { 665 uint8_t in[16]; 666 uint8_t out[16]; 667 uint8_t tmp[16]; 668 for(i=0;i<16;i++) 669 in[i] = i; 670 AES_encrypt(in, tmp, &s->aes_encrypt_key); 671 AES_decrypt(tmp, out, &s->aes_decrypt_key); 672 for(i = 0; i < 16; i++) 673 printf(" %02x", tmp[i]); 674 printf("\n"); 675 for(i = 0; i < 16; i++) 676 printf(" %02x", out[i]); 677 printf("\n"); 678 } 679 #endif 680 return 0; 681 } 682 683 /* We have nothing to do for QCOW2 
reopen, stubs just return 684 * success */ 685 static int qcow2_reopen_prepare(BDRVReopenState *state, 686 BlockReopenQueue *queue, Error **errp) 687 { 688 return 0; 689 } 690 691 static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, 692 int64_t sector_num, int nb_sectors, int *pnum) 693 { 694 BDRVQcowState *s = bs->opaque; 695 uint64_t cluster_offset; 696 int index_in_cluster, ret; 697 int64_t status = 0; 698 699 *pnum = nb_sectors; 700 qemu_co_mutex_lock(&s->lock); 701 ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); 702 qemu_co_mutex_unlock(&s->lock); 703 if (ret < 0) { 704 return ret; 705 } 706 707 if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && 708 !s->crypt_method) { 709 index_in_cluster = sector_num & (s->cluster_sectors - 1); 710 cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); 711 status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; 712 } 713 if (ret == QCOW2_CLUSTER_ZERO) { 714 status |= BDRV_BLOCK_ZERO; 715 } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { 716 status |= BDRV_BLOCK_DATA; 717 } 718 return status; 719 } 720 721 /* handle reading after the end of the backing file */ 722 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, 723 int64_t sector_num, int nb_sectors) 724 { 725 int n1; 726 if ((sector_num + nb_sectors) <= bs->total_sectors) 727 return nb_sectors; 728 if (sector_num >= bs->total_sectors) 729 n1 = 0; 730 else 731 n1 = bs->total_sectors - sector_num; 732 733 qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); 734 735 return n1; 736 } 737 738 static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, 739 int remaining_sectors, QEMUIOVector *qiov) 740 { 741 BDRVQcowState *s = bs->opaque; 742 int index_in_cluster, n1; 743 int ret; 744 int cur_nr_sectors; /* number of sectors in current iteration */ 745 uint64_t cluster_offset = 0; 746 uint64_t bytes_done = 0; 747 QEMUIOVector hd_qiov; 748 uint8_t *cluster_data = NULL; 749 750 
qemu_iovec_init(&hd_qiov, qiov->niov); 751 752 qemu_co_mutex_lock(&s->lock); 753 754 while (remaining_sectors != 0) { 755 756 /* prepare next request */ 757 cur_nr_sectors = remaining_sectors; 758 if (s->crypt_method) { 759 cur_nr_sectors = MIN(cur_nr_sectors, 760 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); 761 } 762 763 ret = qcow2_get_cluster_offset(bs, sector_num << 9, 764 &cur_nr_sectors, &cluster_offset); 765 if (ret < 0) { 766 goto fail; 767 } 768 769 index_in_cluster = sector_num & (s->cluster_sectors - 1); 770 771 qemu_iovec_reset(&hd_qiov); 772 qemu_iovec_concat(&hd_qiov, qiov, bytes_done, 773 cur_nr_sectors * 512); 774 775 switch (ret) { 776 case QCOW2_CLUSTER_UNALLOCATED: 777 778 if (bs->backing_hd) { 779 /* read from the base image */ 780 n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, 781 sector_num, cur_nr_sectors); 782 if (n1 > 0) { 783 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); 784 qemu_co_mutex_unlock(&s->lock); 785 ret = bdrv_co_readv(bs->backing_hd, sector_num, 786 n1, &hd_qiov); 787 qemu_co_mutex_lock(&s->lock); 788 if (ret < 0) { 789 goto fail; 790 } 791 } 792 } else { 793 /* Note: in this case, no need to wait */ 794 qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); 795 } 796 break; 797 798 case QCOW2_CLUSTER_ZERO: 799 qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); 800 break; 801 802 case QCOW2_CLUSTER_COMPRESSED: 803 /* add AIO support for compressed blocks ? */ 804 ret = qcow2_decompress_cluster(bs, cluster_offset); 805 if (ret < 0) { 806 goto fail; 807 } 808 809 qemu_iovec_from_buf(&hd_qiov, 0, 810 s->cluster_cache + index_in_cluster * 512, 811 512 * cur_nr_sectors); 812 break; 813 814 case QCOW2_CLUSTER_NORMAL: 815 if ((cluster_offset & 511) != 0) { 816 ret = -EIO; 817 goto fail; 818 } 819 820 if (s->crypt_method) { 821 /* 822 * For encrypted images, read everything into a temporary 823 * contiguous buffer on which the AES functions can work. 
824 */ 825 if (!cluster_data) { 826 cluster_data = 827 qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); 828 } 829 830 assert(cur_nr_sectors <= 831 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); 832 qemu_iovec_reset(&hd_qiov); 833 qemu_iovec_add(&hd_qiov, cluster_data, 834 512 * cur_nr_sectors); 835 } 836 837 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); 838 qemu_co_mutex_unlock(&s->lock); 839 ret = bdrv_co_readv(bs->file, 840 (cluster_offset >> 9) + index_in_cluster, 841 cur_nr_sectors, &hd_qiov); 842 qemu_co_mutex_lock(&s->lock); 843 if (ret < 0) { 844 goto fail; 845 } 846 if (s->crypt_method) { 847 qcow2_encrypt_sectors(s, sector_num, cluster_data, 848 cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key); 849 qemu_iovec_from_buf(qiov, bytes_done, 850 cluster_data, 512 * cur_nr_sectors); 851 } 852 break; 853 854 default: 855 g_assert_not_reached(); 856 ret = -EIO; 857 goto fail; 858 } 859 860 remaining_sectors -= cur_nr_sectors; 861 sector_num += cur_nr_sectors; 862 bytes_done += cur_nr_sectors * 512; 863 } 864 ret = 0; 865 866 fail: 867 qemu_co_mutex_unlock(&s->lock); 868 869 qemu_iovec_destroy(&hd_qiov); 870 qemu_vfree(cluster_data); 871 872 return ret; 873 } 874 875 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, 876 int64_t sector_num, 877 int remaining_sectors, 878 QEMUIOVector *qiov) 879 { 880 BDRVQcowState *s = bs->opaque; 881 int index_in_cluster; 882 int n_end; 883 int ret; 884 int cur_nr_sectors; /* number of sectors in current iteration */ 885 uint64_t cluster_offset; 886 QEMUIOVector hd_qiov; 887 uint64_t bytes_done = 0; 888 uint8_t *cluster_data = NULL; 889 QCowL2Meta *l2meta = NULL; 890 891 trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, 892 remaining_sectors); 893 894 qemu_iovec_init(&hd_qiov, qiov->niov); 895 896 s->cluster_cache_offset = -1; /* disable compressed cache */ 897 898 qemu_co_mutex_lock(&s->lock); 899 900 while (remaining_sectors != 0) { 901 902 l2meta = NULL; 903 904 
trace_qcow2_writev_start_part(qemu_coroutine_self()); 905 index_in_cluster = sector_num & (s->cluster_sectors - 1); 906 n_end = index_in_cluster + remaining_sectors; 907 if (s->crypt_method && 908 n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { 909 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; 910 } 911 912 ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, 913 index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); 914 if (ret < 0) { 915 goto fail; 916 } 917 918 assert((cluster_offset & 511) == 0); 919 920 qemu_iovec_reset(&hd_qiov); 921 qemu_iovec_concat(&hd_qiov, qiov, bytes_done, 922 cur_nr_sectors * 512); 923 924 if (s->crypt_method) { 925 if (!cluster_data) { 926 cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * 927 s->cluster_size); 928 } 929 930 assert(hd_qiov.size <= 931 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); 932 qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); 933 934 qcow2_encrypt_sectors(s, sector_num, cluster_data, 935 cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key); 936 937 qemu_iovec_reset(&hd_qiov); 938 qemu_iovec_add(&hd_qiov, cluster_data, 939 cur_nr_sectors * 512); 940 } 941 942 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, 943 cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, 944 cur_nr_sectors * BDRV_SECTOR_SIZE); 945 if (ret < 0) { 946 goto fail; 947 } 948 949 qemu_co_mutex_unlock(&s->lock); 950 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); 951 trace_qcow2_writev_data(qemu_coroutine_self(), 952 (cluster_offset >> 9) + index_in_cluster); 953 ret = bdrv_co_writev(bs->file, 954 (cluster_offset >> 9) + index_in_cluster, 955 cur_nr_sectors, &hd_qiov); 956 qemu_co_mutex_lock(&s->lock); 957 if (ret < 0) { 958 goto fail; 959 } 960 961 while (l2meta != NULL) { 962 QCowL2Meta *next; 963 964 ret = qcow2_alloc_cluster_link_l2(bs, l2meta); 965 if (ret < 0) { 966 goto fail; 967 } 968 969 /* Take the request off the list of running requests */ 970 if (l2meta->nb_clusters != 0) { 971 
QLIST_REMOVE(l2meta, next_in_flight); 972 } 973 974 qemu_co_queue_restart_all(&l2meta->dependent_requests); 975 976 next = l2meta->next; 977 g_free(l2meta); 978 l2meta = next; 979 } 980 981 remaining_sectors -= cur_nr_sectors; 982 sector_num += cur_nr_sectors; 983 bytes_done += cur_nr_sectors * 512; 984 trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); 985 } 986 ret = 0; 987 988 fail: 989 qemu_co_mutex_unlock(&s->lock); 990 991 while (l2meta != NULL) { 992 QCowL2Meta *next; 993 994 if (l2meta->nb_clusters != 0) { 995 QLIST_REMOVE(l2meta, next_in_flight); 996 } 997 qemu_co_queue_restart_all(&l2meta->dependent_requests); 998 999 next = l2meta->next; 1000 g_free(l2meta); 1001 l2meta = next; 1002 } 1003 1004 qemu_iovec_destroy(&hd_qiov); 1005 qemu_vfree(cluster_data); 1006 trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); 1007 1008 return ret; 1009 } 1010 1011 static void qcow2_close(BlockDriverState *bs) 1012 { 1013 BDRVQcowState *s = bs->opaque; 1014 g_free(s->l1_table); 1015 /* else pre-write overlap checks in cache_destroy may crash */ 1016 s->l1_table = NULL; 1017 1018 qcow2_cache_flush(bs, s->l2_table_cache); 1019 qcow2_cache_flush(bs, s->refcount_block_cache); 1020 1021 qcow2_mark_clean(bs); 1022 1023 qcow2_cache_destroy(bs, s->l2_table_cache); 1024 qcow2_cache_destroy(bs, s->refcount_block_cache); 1025 1026 g_free(s->unknown_header_fields); 1027 cleanup_unknown_header_ext(bs); 1028 1029 g_free(s->cluster_cache); 1030 qemu_vfree(s->cluster_data); 1031 qcow2_refcount_close(bs); 1032 qcow2_free_snapshots(bs); 1033 } 1034 1035 static void qcow2_invalidate_cache(BlockDriverState *bs) 1036 { 1037 BDRVQcowState *s = bs->opaque; 1038 int flags = s->flags; 1039 AES_KEY aes_encrypt_key; 1040 AES_KEY aes_decrypt_key; 1041 uint32_t crypt_method = 0; 1042 QDict *options; 1043 1044 /* 1045 * Backing files are read-only which makes all of their metadata immutable, 1046 * that means we don't have to worry about reopening them here. 
1047 */ 1048 1049 if (s->crypt_method) { 1050 crypt_method = s->crypt_method; 1051 memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key)); 1052 memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key)); 1053 } 1054 1055 qcow2_close(bs); 1056 1057 options = qdict_new(); 1058 qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, 1059 qbool_from_int(s->use_lazy_refcounts)); 1060 1061 memset(s, 0, sizeof(BDRVQcowState)); 1062 qcow2_open(bs, options, flags); 1063 1064 QDECREF(options); 1065 1066 if (crypt_method) { 1067 s->crypt_method = crypt_method; 1068 memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key)); 1069 memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key)); 1070 } 1071 } 1072 1073 static size_t header_ext_add(char *buf, uint32_t magic, const void *s, 1074 size_t len, size_t buflen) 1075 { 1076 QCowExtension *ext_backing_fmt = (QCowExtension*) buf; 1077 size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); 1078 1079 if (buflen < ext_len) { 1080 return -ENOSPC; 1081 } 1082 1083 *ext_backing_fmt = (QCowExtension) { 1084 .magic = cpu_to_be32(magic), 1085 .len = cpu_to_be32(len), 1086 }; 1087 memcpy(buf + sizeof(QCowExtension), s, len); 1088 1089 return ext_len; 1090 } 1091 1092 /* 1093 * Updates the qcow2 header, including the variable length parts of it, i.e. 1094 * the backing file name and all extensions. qcow2 was not designed to allow 1095 * such changes, so if we run out of space (we can only use the first cluster) 1096 * this function may fail. 1097 * 1098 * Returns 0 on success, -errno in error cases. 
*/
int qcow2_update_header(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    QCowHeader *header;
    char *buf;
    size_t buflen = s->cluster_size;
    int ret;
    uint64_t total_size;
    uint32_t refcount_table_clusters;
    size_t header_length;
    Qcow2UnknownHeaderExtension *uext;

    /* The new header is assembled in a buffer of exactly one cluster */
    buf = qemu_blockalign(bs, buflen);

    /* Header structure */
    header = (QCowHeader*) buf;

    if (buflen < sizeof(*header)) {
        ret = -ENOSPC;
        goto fail;
    }

    header_length = sizeof(*header) + s->unknown_header_fields_size;
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);

    *header = (QCowHeader) {
        /* Version 2 fields */
        .magic                  = cpu_to_be32(QCOW_MAGIC),
        .version                = cpu_to_be32(s->qcow_version),
        .backing_file_offset    = 0,
        .backing_file_size      = 0,
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
        .size                   = cpu_to_be64(total_size),
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
        .l1_size                = cpu_to_be32(s->l1_size),
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),

        /* Version 3 fields */
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
        .compatible_features    = cpu_to_be64(s->compatible_features),
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
        .refcount_order         = cpu_to_be32(3 + REFCOUNT_SHIFT),
        .header_length          = cpu_to_be32(header_length),
    };

    /* For older versions, write a shorter header */
    switch (s->qcow_version) {
    case 2:
        ret = offsetof(QCowHeader, incompatible_features);
        break;
    case 3:
        ret = sizeof(*header);
        break;
    default:
        ret = -EINVAL;
        goto fail;
    }

    /*
     * ret is the length of the fixed header for this version.  Everything
     * behind it is zeroed, which for version 2 also wipes the version 3
     * fields that the initialiser above has just stored.
     */
    buf += ret;
    buflen -= ret;
    memset(buf, 0, buflen);

    /* Preserve any unknown field in the header */
    if (s->unknown_header_fields_size) {
        if (buflen < s->unknown_header_fields_size) {
            ret = -ENOSPC;
            goto fail;
        }

        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
        buf += s->unknown_header_fields_size;
        buflen -= s->unknown_header_fields_size;
    }

    /* Backing file format header extension */
    if (*bs->backing_format) {
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
                             bs->backing_format, strlen(bs->backing_format),
                             buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* Feature table */
    Qcow2Feature features[] = {
        {
            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
            .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
            .name = "dirty bit",
        },
        {
            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
            .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
            .name = "corrupt bit",
        },
        {
            .type = QCOW2_FEAT_TYPE_COMPATIBLE,
            .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
            .name = "lazy refcounts",
        },
    };

    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
                         features, sizeof(features), buflen);
    if (ret < 0) {
        goto fail;
    }
    buf += ret;
    buflen -= ret;

    /* Keep unknown header extensions */
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
        if (ret < 0) {
            goto fail;
        }

        buf += ret;
        buflen -= ret;
    }

    /* End of header extensions */
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
    if (ret < 0) {
        goto fail;
    }

    buf += ret;
    buflen -= ret;

    /* Backing file name */
    if (*bs->backing_file) {
        size_t backing_file_len = strlen(bs->backing_file);

        if (buflen < backing_file_len) {
            ret = -ENOSPC;
            goto fail;
        }

        /* Using strncpy is ok here, since buf is not NUL-terminated. */
        strncpy(buf, bs->backing_file, buflen);

        /* The name is stored right behind the end-of-extensions marker */
        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
        header->backing_file_size   = cpu_to_be32(backing_file_len);
    }

    /* Write the new header */
    ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    /* header still points at the start of the buffer; buf has moved on */
    qemu_vfree(header);
    return ret;
}

/*
 * Stores the new backing file name and format in bs (a NULL argument is
 * stored as an empty string) and rewrites the image header.  Returns the
 * result of qcow2_update_header().
 */
static int qcow2_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
    pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");

    return qcow2_update_header(bs);
}

/*
 * Allocates clusters for the whole length of the image, at most
 * INT_MAX >> 9 sectors per iteration, and links them into the L2 tables.
 */
static int preallocate(BlockDriverState *bs)
{
    uint64_t nb_sectors;
    uint64_t offset;
    uint64_t host_offset = 0;
    int num;
    int ret;
    QCowL2Meta *meta;

    nb_sectors = bdrv_getlength(bs) >> 9;
    offset = 0;

    while (nb_sectors) {
        num = MIN(nb_sectors, INT_MAX >> 9);
        ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num,
                                         &host_offset, &meta);
        if (ret < 0) {
            return ret;
        }

        /*
         * NOTE(review): meta is passed on and dereferenced here, yet it is
         * only compared against NULL further down - confirm whether
         * qcow2_alloc_cluster_offset() can leave it NULL on success.
         */
        ret = qcow2_alloc_cluster_link_l2(bs, meta);
        if (ret < 0) {
            qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters,
                                    QCOW2_DISCARD_NEVER);
            return ret;
        }

        /* There are no dependent requests, but we need to remove our request
         * from the list of in-flight requests */
        if (meta != NULL) {
            QLIST_REMOVE(meta, next_in_flight);
        }

        /* TODO Preallocate data if requested */

        nb_sectors -= num;
        offset += num << 9;
    }

    /*
     * It is expected that the image file is
large enough to actually contain 1317 * all of the allocated clusters (otherwise we get failing reads after 1318 * EOF). Extend the image to the last allocated sector. 1319 */ 1320 if (host_offset != 0) { 1321 uint8_t buf[512]; 1322 memset(buf, 0, 512); 1323 ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); 1324 if (ret < 0) { 1325 return ret; 1326 } 1327 } 1328 1329 return 0; 1330 } 1331 1332 static int qcow2_create2(const char *filename, int64_t total_size, 1333 const char *backing_file, const char *backing_format, 1334 int flags, size_t cluster_size, int prealloc, 1335 QEMUOptionParameter *options, int version) 1336 { 1337 /* Calculate cluster_bits */ 1338 int cluster_bits; 1339 cluster_bits = ffs(cluster_size) - 1; 1340 if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || 1341 (1 << cluster_bits) != cluster_size) 1342 { 1343 error_report( 1344 "Cluster size must be a power of two between %d and %dk", 1345 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); 1346 return -EINVAL; 1347 } 1348 1349 /* 1350 * Open the image file and write a minimal qcow2 header. 1351 * 1352 * We keep things simple and start with a zero-sized image. We also 1353 * do without refcount blocks or a L1 table for now. We'll fix the 1354 * inconsistency later. 1355 * 1356 * We do need a refcount table because growing the refcount table means 1357 * allocating two new refcount blocks - the seconds of which would be at 1358 * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file 1359 * size for any qcow2 image. 
1360 */ 1361 BlockDriverState* bs; 1362 QCowHeader header; 1363 uint8_t* refcount_table; 1364 int ret; 1365 1366 ret = bdrv_create_file(filename, options); 1367 if (ret < 0) { 1368 return ret; 1369 } 1370 1371 ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); 1372 if (ret < 0) { 1373 return ret; 1374 } 1375 1376 /* Write the header */ 1377 memset(&header, 0, sizeof(header)); 1378 header.magic = cpu_to_be32(QCOW_MAGIC); 1379 header.version = cpu_to_be32(version); 1380 header.cluster_bits = cpu_to_be32(cluster_bits); 1381 header.size = cpu_to_be64(0); 1382 header.l1_table_offset = cpu_to_be64(0); 1383 header.l1_size = cpu_to_be32(0); 1384 header.refcount_table_offset = cpu_to_be64(cluster_size); 1385 header.refcount_table_clusters = cpu_to_be32(1); 1386 header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); 1387 header.header_length = cpu_to_be32(sizeof(header)); 1388 1389 if (flags & BLOCK_FLAG_ENCRYPT) { 1390 header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); 1391 } else { 1392 header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); 1393 } 1394 1395 if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { 1396 header.compatible_features |= 1397 cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); 1398 } 1399 1400 ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); 1401 if (ret < 0) { 1402 goto out; 1403 } 1404 1405 /* Write an empty refcount table */ 1406 refcount_table = g_malloc0(cluster_size); 1407 ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); 1408 g_free(refcount_table); 1409 1410 if (ret < 0) { 1411 goto out; 1412 } 1413 1414 bdrv_close(bs); 1415 1416 /* 1417 * And now open the image and make it consistent first (i.e. 
increase the 1418 * refcount of the cluster that is occupied by the header and the refcount 1419 * table) 1420 */ 1421 BlockDriver* drv = bdrv_find_format("qcow2"); 1422 assert(drv != NULL); 1423 ret = bdrv_open(bs, filename, NULL, 1424 BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); 1425 if (ret < 0) { 1426 goto out; 1427 } 1428 1429 ret = qcow2_alloc_clusters(bs, 2 * cluster_size); 1430 if (ret < 0) { 1431 goto out; 1432 1433 } else if (ret != 0) { 1434 error_report("Huh, first cluster in empty image is already in use?"); 1435 abort(); 1436 } 1437 1438 /* Okay, now that we have a valid image, let's give it the right size */ 1439 ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); 1440 if (ret < 0) { 1441 goto out; 1442 } 1443 1444 /* Want a backing file? There you go.*/ 1445 if (backing_file) { 1446 ret = bdrv_change_backing_file(bs, backing_file, backing_format); 1447 if (ret < 0) { 1448 goto out; 1449 } 1450 } 1451 1452 /* And if we're supposed to preallocate metadata, do that now */ 1453 if (prealloc) { 1454 BDRVQcowState *s = bs->opaque; 1455 qemu_co_mutex_lock(&s->lock); 1456 ret = preallocate(bs); 1457 qemu_co_mutex_unlock(&s->lock); 1458 if (ret < 0) { 1459 goto out; 1460 } 1461 } 1462 1463 ret = 0; 1464 out: 1465 bdrv_unref(bs); 1466 return ret; 1467 } 1468 1469 static int qcow2_create(const char *filename, QEMUOptionParameter *options) 1470 { 1471 const char *backing_file = NULL; 1472 const char *backing_fmt = NULL; 1473 uint64_t sectors = 0; 1474 int flags = 0; 1475 size_t cluster_size = DEFAULT_CLUSTER_SIZE; 1476 int prealloc = 0; 1477 int version = 3; 1478 1479 /* Read out options */ 1480 while (options && options->name) { 1481 if (!strcmp(options->name, BLOCK_OPT_SIZE)) { 1482 sectors = options->value.n / 512; 1483 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { 1484 backing_file = options->value.s; 1485 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { 1486 backing_fmt = options->value.s; 1487 } else if 
(!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { 1488 flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; 1489 } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { 1490 if (options->value.n) { 1491 cluster_size = options->value.n; 1492 } 1493 } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { 1494 if (!options->value.s || !strcmp(options->value.s, "off")) { 1495 prealloc = 0; 1496 } else if (!strcmp(options->value.s, "metadata")) { 1497 prealloc = 1; 1498 } else { 1499 fprintf(stderr, "Invalid preallocation mode: '%s'\n", 1500 options->value.s); 1501 return -EINVAL; 1502 } 1503 } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { 1504 if (!options->value.s) { 1505 /* keep the default */ 1506 } else if (!strcmp(options->value.s, "0.10")) { 1507 version = 2; 1508 } else if (!strcmp(options->value.s, "1.1")) { 1509 version = 3; 1510 } else { 1511 fprintf(stderr, "Invalid compatibility level: '%s'\n", 1512 options->value.s); 1513 return -EINVAL; 1514 } 1515 } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { 1516 flags |= options->value.n ? 
BLOCK_FLAG_LAZY_REFCOUNTS : 0; 1517 } 1518 options++; 1519 } 1520 1521 if (backing_file && prealloc) { 1522 fprintf(stderr, "Backing file and preallocation cannot be used at " 1523 "the same time\n"); 1524 return -EINVAL; 1525 } 1526 1527 if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { 1528 fprintf(stderr, "Lazy refcounts only supported with compatibility " 1529 "level 1.1 and above (use compat=1.1 or greater)\n"); 1530 return -EINVAL; 1531 } 1532 1533 return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, 1534 cluster_size, prealloc, options, version); 1535 } 1536 1537 static int qcow2_make_empty(BlockDriverState *bs) 1538 { 1539 #if 0 1540 /* XXX: not correct */ 1541 BDRVQcowState *s = bs->opaque; 1542 uint32_t l1_length = s->l1_size * sizeof(uint64_t); 1543 int ret; 1544 1545 memset(s->l1_table, 0, l1_length); 1546 if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) 1547 return -1; 1548 ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); 1549 if (ret < 0) 1550 return ret; 1551 1552 l2_cache_reset(bs); 1553 #endif 1554 return 0; 1555 } 1556 1557 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, 1558 int64_t sector_num, int nb_sectors) 1559 { 1560 int ret; 1561 BDRVQcowState *s = bs->opaque; 1562 1563 /* Emulate misaligned zero writes */ 1564 if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { 1565 return -ENOTSUP; 1566 } 1567 1568 /* Whatever is left can use real zero clusters */ 1569 qemu_co_mutex_lock(&s->lock); 1570 ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, 1571 nb_sectors); 1572 qemu_co_mutex_unlock(&s->lock); 1573 1574 return ret; 1575 } 1576 1577 static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, 1578 int64_t sector_num, int nb_sectors) 1579 { 1580 int ret; 1581 BDRVQcowState *s = bs->opaque; 1582 1583 qemu_co_mutex_lock(&s->lock); 1584 ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, 1585 nb_sectors); 1586 
qemu_co_mutex_unlock(&s->lock); 1587 return ret; 1588 } 1589 1590 static int qcow2_truncate(BlockDriverState *bs, int64_t offset) 1591 { 1592 BDRVQcowState *s = bs->opaque; 1593 int64_t new_l1_size; 1594 int ret; 1595 1596 if (offset & 511) { 1597 error_report("The new size must be a multiple of 512"); 1598 return -EINVAL; 1599 } 1600 1601 /* cannot proceed if image has snapshots */ 1602 if (s->nb_snapshots) { 1603 error_report("Can't resize an image which has snapshots"); 1604 return -ENOTSUP; 1605 } 1606 1607 /* shrinking is currently not supported */ 1608 if (offset < bs->total_sectors * 512) { 1609 error_report("qcow2 doesn't support shrinking images yet"); 1610 return -ENOTSUP; 1611 } 1612 1613 new_l1_size = size_to_l1(s, offset); 1614 ret = qcow2_grow_l1_table(bs, new_l1_size, true); 1615 if (ret < 0) { 1616 return ret; 1617 } 1618 1619 /* write updated header.size */ 1620 offset = cpu_to_be64(offset); 1621 ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), 1622 &offset, sizeof(uint64_t)); 1623 if (ret < 0) { 1624 return ret; 1625 } 1626 1627 s->l1_vm_state_index = new_l1_size; 1628 return 0; 1629 } 1630 1631 /* XXX: put compressed sectors first, then all the cluster aligned 1632 tables to avoid losing bytes in alignment */ 1633 static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, 1634 const uint8_t *buf, int nb_sectors) 1635 { 1636 BDRVQcowState *s = bs->opaque; 1637 z_stream strm; 1638 int ret, out_len; 1639 uint8_t *out_buf; 1640 uint64_t cluster_offset; 1641 1642 if (nb_sectors == 0) { 1643 /* align end of file to a sector boundary to ease reading with 1644 sector based I/Os */ 1645 cluster_offset = bdrv_getlength(bs->file); 1646 cluster_offset = (cluster_offset + 511) & ~511; 1647 bdrv_truncate(bs->file, cluster_offset); 1648 return 0; 1649 } 1650 1651 if (nb_sectors != s->cluster_sectors) { 1652 ret = -EINVAL; 1653 1654 /* Zero-pad last write if image size is not cluster aligned */ 1655 if (sector_num + nb_sectors == 
bs->total_sectors && 1656 nb_sectors < s->cluster_sectors) { 1657 uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); 1658 memset(pad_buf, 0, s->cluster_size); 1659 memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); 1660 ret = qcow2_write_compressed(bs, sector_num, 1661 pad_buf, s->cluster_sectors); 1662 qemu_vfree(pad_buf); 1663 } 1664 return ret; 1665 } 1666 1667 out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); 1668 1669 /* best compression, small window, no zlib header */ 1670 memset(&strm, 0, sizeof(strm)); 1671 ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, 1672 Z_DEFLATED, -12, 1673 9, Z_DEFAULT_STRATEGY); 1674 if (ret != 0) { 1675 ret = -EINVAL; 1676 goto fail; 1677 } 1678 1679 strm.avail_in = s->cluster_size; 1680 strm.next_in = (uint8_t *)buf; 1681 strm.avail_out = s->cluster_size; 1682 strm.next_out = out_buf; 1683 1684 ret = deflate(&strm, Z_FINISH); 1685 if (ret != Z_STREAM_END && ret != Z_OK) { 1686 deflateEnd(&strm); 1687 ret = -EINVAL; 1688 goto fail; 1689 } 1690 out_len = strm.next_out - out_buf; 1691 1692 deflateEnd(&strm); 1693 1694 if (ret != Z_STREAM_END || out_len >= s->cluster_size) { 1695 /* could not compress: write normal cluster */ 1696 1697 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, 1698 sector_num * BDRV_SECTOR_SIZE, 1699 s->cluster_sectors * BDRV_SECTOR_SIZE); 1700 if (ret < 0) { 1701 goto fail; 1702 } 1703 1704 ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); 1705 if (ret < 0) { 1706 goto fail; 1707 } 1708 } else { 1709 cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, 1710 sector_num << 9, out_len); 1711 if (!cluster_offset) { 1712 ret = -EIO; 1713 goto fail; 1714 } 1715 cluster_offset &= s->cluster_offset_mask; 1716 1717 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, 1718 cluster_offset, out_len); 1719 if (ret < 0) { 1720 goto fail; 1721 } 1722 1723 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); 1724 ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, 
out_len); 1725 if (ret < 0) { 1726 goto fail; 1727 } 1728 } 1729 1730 ret = 0; 1731 fail: 1732 g_free(out_buf); 1733 return ret; 1734 } 1735 1736 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) 1737 { 1738 BDRVQcowState *s = bs->opaque; 1739 int ret; 1740 1741 qemu_co_mutex_lock(&s->lock); 1742 ret = qcow2_cache_flush(bs, s->l2_table_cache); 1743 if (ret < 0) { 1744 qemu_co_mutex_unlock(&s->lock); 1745 return ret; 1746 } 1747 1748 if (qcow2_need_accurate_refcounts(s)) { 1749 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 1750 if (ret < 0) { 1751 qemu_co_mutex_unlock(&s->lock); 1752 return ret; 1753 } 1754 } 1755 qemu_co_mutex_unlock(&s->lock); 1756 1757 return 0; 1758 } 1759 1760 static int64_t qcow2_vm_state_offset(BDRVQcowState *s) 1761 { 1762 return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); 1763 } 1764 1765 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 1766 { 1767 BDRVQcowState *s = bs->opaque; 1768 bdi->cluster_size = s->cluster_size; 1769 bdi->vm_state_offset = qcow2_vm_state_offset(s); 1770 return 0; 1771 } 1772 1773 #if 0 1774 static void dump_refcounts(BlockDriverState *bs) 1775 { 1776 BDRVQcowState *s = bs->opaque; 1777 int64_t nb_clusters, k, k1, size; 1778 int refcount; 1779 1780 size = bdrv_getlength(bs->file); 1781 nb_clusters = size_to_clusters(s, size); 1782 for(k = 0; k < nb_clusters;) { 1783 k1 = k; 1784 refcount = get_refcount(bs, k); 1785 k++; 1786 while (k < nb_clusters && get_refcount(bs, k) == refcount) 1787 k++; 1788 printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, 1789 k - k1); 1790 } 1791 } 1792 #endif 1793 1794 static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, 1795 int64_t pos) 1796 { 1797 BDRVQcowState *s = bs->opaque; 1798 int growable = bs->growable; 1799 int ret; 1800 1801 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); 1802 bs->growable = 1; 1803 ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); 1804 bs->growable = 
growable; 1805 1806 return ret; 1807 } 1808 1809 static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, 1810 int64_t pos, int size) 1811 { 1812 BDRVQcowState *s = bs->opaque; 1813 int growable = bs->growable; 1814 bool zero_beyond_eof = bs->zero_beyond_eof; 1815 int ret; 1816 1817 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); 1818 bs->growable = 1; 1819 bs->zero_beyond_eof = false; 1820 ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); 1821 bs->growable = growable; 1822 bs->zero_beyond_eof = zero_beyond_eof; 1823 1824 return ret; 1825 } 1826 1827 static QEMUOptionParameter qcow2_create_options[] = { 1828 { 1829 .name = BLOCK_OPT_SIZE, 1830 .type = OPT_SIZE, 1831 .help = "Virtual disk size" 1832 }, 1833 { 1834 .name = BLOCK_OPT_COMPAT_LEVEL, 1835 .type = OPT_STRING, 1836 .help = "Compatibility level (0.10 or 1.1)" 1837 }, 1838 { 1839 .name = BLOCK_OPT_BACKING_FILE, 1840 .type = OPT_STRING, 1841 .help = "File name of a base image" 1842 }, 1843 { 1844 .name = BLOCK_OPT_BACKING_FMT, 1845 .type = OPT_STRING, 1846 .help = "Image format of the base image" 1847 }, 1848 { 1849 .name = BLOCK_OPT_ENCRYPT, 1850 .type = OPT_FLAG, 1851 .help = "Encrypt the image" 1852 }, 1853 { 1854 .name = BLOCK_OPT_CLUSTER_SIZE, 1855 .type = OPT_SIZE, 1856 .help = "qcow2 cluster size", 1857 .value = { .n = DEFAULT_CLUSTER_SIZE }, 1858 }, 1859 { 1860 .name = BLOCK_OPT_PREALLOC, 1861 .type = OPT_STRING, 1862 .help = "Preallocation mode (allowed values: off, metadata)" 1863 }, 1864 { 1865 .name = BLOCK_OPT_LAZY_REFCOUNTS, 1866 .type = OPT_FLAG, 1867 .help = "Postpone refcount updates", 1868 }, 1869 { NULL } 1870 }; 1871 1872 static BlockDriver bdrv_qcow2 = { 1873 .format_name = "qcow2", 1874 .instance_size = sizeof(BDRVQcowState), 1875 .bdrv_probe = qcow2_probe, 1876 .bdrv_open = qcow2_open, 1877 .bdrv_close = qcow2_close, 1878 .bdrv_reopen_prepare = qcow2_reopen_prepare, 1879 .bdrv_create = qcow2_create, 1880 .bdrv_has_zero_init = bdrv_has_zero_init_1, 1881 
.bdrv_co_get_block_status = qcow2_co_get_block_status, 1882 .bdrv_set_key = qcow2_set_key, 1883 .bdrv_make_empty = qcow2_make_empty, 1884 1885 .bdrv_co_readv = qcow2_co_readv, 1886 .bdrv_co_writev = qcow2_co_writev, 1887 .bdrv_co_flush_to_os = qcow2_co_flush_to_os, 1888 1889 .bdrv_co_write_zeroes = qcow2_co_write_zeroes, 1890 .bdrv_co_discard = qcow2_co_discard, 1891 .bdrv_truncate = qcow2_truncate, 1892 .bdrv_write_compressed = qcow2_write_compressed, 1893 1894 .bdrv_snapshot_create = qcow2_snapshot_create, 1895 .bdrv_snapshot_goto = qcow2_snapshot_goto, 1896 .bdrv_snapshot_delete = qcow2_snapshot_delete, 1897 .bdrv_snapshot_list = qcow2_snapshot_list, 1898 .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, 1899 .bdrv_get_info = qcow2_get_info, 1900 1901 .bdrv_save_vmstate = qcow2_save_vmstate, 1902 .bdrv_load_vmstate = qcow2_load_vmstate, 1903 1904 .bdrv_change_backing_file = qcow2_change_backing_file, 1905 1906 .bdrv_invalidate_cache = qcow2_invalidate_cache, 1907 1908 .create_options = qcow2_create_options, 1909 .bdrv_check = qcow2_check, 1910 }; 1911 1912 static void bdrv_qcow2_init(void) 1913 { 1914 bdrv_register(&bdrv_qcow2); 1915 } 1916 1917 block_init(bdrv_qcow2_init); 1918