/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/range.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length,
                            int addend, enum qcow2_discard_type type);


/*********************************************************/
/* refcount handling */

int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
    }
    return 0;
fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    g_free(s->refcount_table);
}


static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
        refcount_block);

    return ret;
}

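/*
 * Layout note (illustrative, derived from the code below): refcounts are
 * stored as big-endian 16-bit entries (REFCOUNT_SHIFT == 1), so one refcount
 * block holds cluster_size >> REFCOUNT_SHIFT of them. With the default
 * 64 KiB clusters that is 32768 entries per block, i.e. one refcount block
 * describes 32768 * 64 KiB = 2 GiB of the image file. A cluster index is
 * therefore split as:
 *
 *   refcount_table_index = cluster_index >> (cluster_bits - REFCOUNT_SHIFT)
 *   block_index = cluster_index & ((1 << (cluster_bits - REFCOUNT_SHIFT)) - 1)
 */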
/*
 * Returns the refcount of the cluster given by its index. Any non-negative
 * return value is the refcount of the cluster, negative values are -errno
 * and indicate an error.
 */
static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    uint16_t *refcount_block;
    uint16_t refcount;

    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    if (refcount_table_index >= s->refcount_table_size)
        return 0;
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset)
        return 0;

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          (void**) &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index &
        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    refcount = be16_to_cpu(refcount_block[block_index]);

    ret = qcow2_cache_put(bs, s->refcount_block_cache,
        (void**) &refcount_block);
    if (ret < 0) {
        return ret;
    }

    return refcount;
}

/*
 * Rounds the refcount table size up to avoid growing the table for each single
 * refcount block that is allocated.
 */
static unsigned int next_refcount_table_size(BDRVQcowState *s,
    unsigned int min_size)
{
    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
    unsigned int refcount_table_clusters =
        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));

    while (min_clusters > refcount_table_clusters) {
        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
    }

    return refcount_table_clusters << (s->cluster_bits - 3);
}


/* Checks if two offsets are described by the same refcount block */
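/*
 * (Illustrative derivation:) one refcount block covers
 * cluster_size >> REFCOUNT_SHIFT clusters of cluster_size bytes each, i.e.
 * 1 << (2 * cluster_bits - REFCOUNT_SHIFT) bytes of the image file, hence
 * the shift below. With 64 KiB clusters, two offsets share a refcount block
 * iff they lie in the same 2 GiB chunk of the file.
 */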
static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
    uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
    uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);

    return (block_a == block_b);
}

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int alloc_refcount_block(BlockDriverState *bs,
    int64_t cluster_index, uint16_t **refcount_block)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int refcount_table_index;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       (void**) refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount()
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif
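
    /*
     * Two cases (summarising the logic below): if the new block lands in the
     * range of clusters it will itself describe, its refcount can simply be
     * set within the block before it is first written. Otherwise its refcount
     * lives in some other, already reachable refcount block and is updated
     * through the normal update_refcount() path.
     */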
    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
            (void**) refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
        (*refcount_block)[block_index] = cpu_to_be16(1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail_block;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail_block;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
            (void**) refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail_block;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail_block;
        }

        s->refcount_table[refcount_table_index] = new_block;

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
    if (ret < 0) {
        goto fail_block;
    }
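
    /*
     * (Scale note, for orientation:) reaching this point is rare. With the
     * default 64 KiB clusters, a single reftable cluster holds 8192 entries
     * and each refcount block covers 2 GiB of the image file, so one reftable
     * cluster is enough for a 16 TiB image file.
     */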
    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far */
    uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
    uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);

    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
    uint64_t last_table_size;
    uint64_t blocks_clusters;
    do {
        uint64_t table_clusters =
            size_to_clusters(s, table_size * sizeof(uint64_t));
        blocks_clusters = 1 +
            ((table_clusters + refcount_block_clusters - 1)
            / refcount_block_clusters);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
            ((meta_clusters + refcount_block_clusters - 1)
            / refcount_block_clusters));

    } while (last_table_size != table_size);
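
    /*
     * (Worked example, assuming 64 KiB clusters:) the new table and blocks
     * must account for themselves, hence the fixed-point iteration above.
     * Say blocks_used is 5: the first guess is a table of 8192 entries (one
     * cluster), which needs two additional refcount blocks; the recomputed
     * table size is again 8192, so the loop stops after the first iteration.
     */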
#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
        s->refcount_table_size, table_size);
#endif

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
        s->cluster_size;
    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
    uint64_t *new_table = g_try_new0(uint64_t, table_size);
    uint16_t *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);

    assert(table_size > 0 && blocks_clusters > 0);
    if (new_table == NULL || new_blocks == NULL) {
        ret = -ENOMEM;
        goto fail_table;
    }

    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
    new_table[refcount_table_index] = new_block;

    int i;
    for (i = 0; i < blocks_clusters; i++) {
        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
    }

    /* Fill the refcount blocks */
    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
    int block = 0;
    for (i = 0; i < table_clusters + blocks_clusters; i++) {
        new_blocks[block++] = cpu_to_be16(1);
    }

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
        blocks_clusters * s->cluster_size);
    g_free(new_blocks);
    new_blocks = NULL;
    if (ret < 0) {
        goto fail_table;
    }

    /* Write refcount table to disk */
    for(i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail_table;
    }

    for(i = 0; i < table_size; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    uint8_t data[12];
    cpu_to_be64w((uint64_t*)data, table_offset);
    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
        data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    ret = load_refcount_block(bs, new_block, (void**) refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail_table:
    g_free(new_blocks);
    g_free(new_table);
fail_block:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
    }
    return ret;
}

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcowState *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_discard(bs->file,
                         d->offset >> BDRV_SECTOR_BITS,
                         d->bytes >> BDRV_SECTOR_BITS);
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcowState *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs = bs,
        .offset = offset,
        .bytes = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
    }
}

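/*
 * (Contract summary, derived from the code below:) update_refcount() adjusts
 * the refcount of every cluster in [offset, offset + length) by addend. On
 * error it tries to roll back the clusters already touched, and it propagates
 * -EAGAIN from refblock allocation so that callers such as
 * qcow2_alloc_clusters() can restart their search for free clusters.
 */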
/* XXX: cache several refcount block clusters ? */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
    int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
{
    BDRVQcowState *s = bs->opaque;
    int64_t start, last, cluster_offset;
    uint16_t *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
           offset, length, addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (addend < 0) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index, refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index =
            cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                ret = qcow2_cache_put(bs, s->refcount_block_cache,
                    (void**) &refcount_block);
                if (ret < 0) {
                    goto fail;
                }
            }

            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index &
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);

        refcount = be16_to_cpu(refcount_block[block_index]);
        refcount += addend;
        if (refcount < 0 || refcount > 0xffff) {
            ret = -EINVAL;
            goto fail;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        refcount_block[block_index] = cpu_to_be16(refcount);

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        int wret;
        wret = qcow2_cache_put(bs, s->refcount_block_cache,
            (void**) &refcount_block);
        if (wret < 0) {
            return ret < 0 ? ret : wret;
        }
    }

    /*
     * Try to undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend,
                                QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster by one.
 * addend must be 1 or -1.
 *
 * If the return value is non-negative, it is the new refcount of the cluster.
 * If it is negative, it is -errno and indicates an error.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  int addend,
                                  enum qcow2_discard_type type)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          type);
    if (ret < 0) {
        return ret;
    }

    return get_refcount(bs, cluster_index);
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t i, nb_clusters;
    int refcount;

    nb_clusters = size_to_clusters(s, size);
retry:
    for(i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        refcount = get_refcount(bs, next_cluster_index);

        if (refcount < 0) {
            return refcount;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in an int64_t */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

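/*
 * Allocates up to nb_clusters clusters at the given offset, as far as they
 * are still free. Returns the number of contiguous clusters that could be
 * allocated this way (which may be less than nb_clusters), or -errno on
 * failure.
 */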
int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
    int nb_clusters)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t cluster_index;
    uint64_t i;
    int refcount, ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for(i = 0; i < nb_clusters; i++) {
            refcount = get_refcount(bs, cluster_index++);

            if (refcount < 0) {
                return refcount;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcowState *s = bs->opaque;
    int64_t offset, cluster_offset;
    int free_in_cluster;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    if (s->free_byte_offset == 0) {
        offset = qcow2_alloc_clusters(bs, s->cluster_size);
        if (offset < 0) {
            return offset;
        }
        s->free_byte_offset = offset;
    }
redo:
    free_in_cluster = s->cluster_size -
        offset_into_cluster(s, s->free_byte_offset);
    if (size <= free_in_cluster) {
        /* enough space in current cluster */
        offset = s->free_byte_offset;
        s->free_byte_offset += size;
        free_in_cluster -= size;
        if (free_in_cluster == 0)
            s->free_byte_offset = 0;
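        /* Every sub-cluster allocation after the first takes an extra
         * reference on the cluster (the cluster-aligned first allocation
         * keeps the reference taken by qcow2_alloc_clusters()), so the
         * cluster is only freed once all compressed sector groups stored in
         * it have been freed. */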
        if (offset_into_cluster(s, offset) != 0)
            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
                                          QCOW2_DISCARD_NEVER);
    } else {
        offset = qcow2_alloc_clusters(bs, s->cluster_size);
        if (offset < 0) {
            return offset;
        }
        cluster_offset = start_of_cluster(s, s->free_byte_offset);
        if ((cluster_offset + s->cluster_size) == offset) {
            /* we are lucky: contiguous data */
            offset = s->free_byte_offset;
            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
                                          QCOW2_DISCARD_NEVER);
            s->free_byte_offset += size;
        } else {
            s->free_byte_offset = offset;
            goto redo;
        }
    }

    /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
     * or explicitly by qcow2_update_cluster_refcount(). Refcount blocks must
     * be flushed before the caller's L2 table updates.
     */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
    return offset;
}

void qcow2_free_clusters(BlockDriverState *bs,
                         int64_t offset, int64_t size,
                         enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, -1, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcowState *s = bs->opaque;

    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                (l2_entry & s->cluster_offset_mask) & ~511,
                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO:
        if (l2_entry & L2E_OFFSET_MASK) {
            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
                qcow2_signal_corruption(bs, false, -1, -1,
                                        "Cannot free unaligned cluster %#llx",
                                        l2_entry & L2E_OFFSET_MASK);
            } else {
                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                    nb_clusters << s->cluster_bits, type);
            }
        }
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}



/*********************************************************/
/* snapshots and image creation */



/* update the refcounts of snapshots and the copied flag */
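/*
 * (Usage note, hedged:) this is typically called with addend +1 when a
 * snapshot of an L1 table is taken, -1 when a snapshot is deleted, and 0 to
 * merely recompute the QCOW_OFLAG_COPIED flags from the current refcounts.
 */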
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2;
    bool l1_allocated = false;
    int64_t old_offset, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors, refcount;
    int ret;

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;

                offset = be64_to_cpu(l2_table[j]);
                old_offset = offset;
                offset &= ~QCOW_OFLAG_COPIED;

                switch (qcow2_get_cluster_type(offset)) {
                case QCOW2_CLUSTER_COMPRESSED:
                    nb_csectors = ((offset >> s->csize_shift) &
                                   s->csize_mask) + 1;
                    if (addend != 0) {
                        ret = update_refcount(bs,
                            (offset & s->cluster_offset_mask) & ~511,
                            nb_csectors * 512, addend,
                            QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }
                    /* compressed clusters are never modified */
                    refcount = 2;
                    break;

                case QCOW2_CLUSTER_NORMAL:
                case QCOW2_CLUSTER_ZERO:
                    if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
                        qcow2_signal_corruption(bs, true, -1, -1, "Data "
                                                "cluster offset %#llx "
                                                "unaligned (L2 offset: %#"
                                                PRIx64 ", L2 index: %#x)",
                                                offset & L2E_OFFSET_MASK,
                                                l2_offset, j);
                        ret = -EIO;
                        goto fail;
                    }

                    cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
                    if (!cluster_index) {
                        /* unallocated */
                        refcount = 0;
                        break;
                    }
                    if (addend != 0) {
                        refcount = qcow2_update_cluster_refcount(bs,
                                cluster_index, addend,
                                QCOW2_DISCARD_SNAPSHOT);
                    } else {
                        refcount = get_refcount(bs, cluster_index);
                    }

                    if (refcount < 0) {
                        ret = refcount;
                        goto fail;
                    }
                    break;

                case QCOW2_CLUSTER_UNALLOCATED:
                    refcount = 0;
                    break;

                default:
                    abort();
                }

                if (refcount == 1) {
                    offset |= QCOW_OFLAG_COPIED;
                }
                if (offset != old_offset) {
                    if (addend > 0) {
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                            s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(offset);
                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                }
            }

            ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }


            if (addend != 0) {
                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
            if (refcount < 0) {
                ret = refcount;
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}




/*********************************************************/
/* refcount checking functions */



/*
 * Increases the refcount for a range of clusters in a given refcount table.
 * This is used to construct a temporary refcount table out of L1 and L2 tables
 * which can be compared to the refcount table saved in the image.
 *
 * Modifies the number of errors in res.
 */
static void inc_refcounts(BlockDriverState *bs,
                          BdrvCheckResult *res,
                          uint16_t *refcount_table,
                          int refcount_table_size,
                          int64_t offset, int64_t size)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t start, last, cluster_offset, k;

    if (size <= 0)
        return;

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + size - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        if (k >= refcount_table_size) {
            fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
                "the end of the image file, can't properly check refcounts.\n",
                cluster_offset);
            res->check_errors++;
        } else {
            if (++refcount_table[k] == 0) {
                fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
                res->corruptions++;
            }
        }
    }
}

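/* Note: the temporary table uses the same 16-bit counters as the image
 * itself, so ++refcount_table[k] wrapping to 0 above reliably detects a
 * reference count exceeding the 65535 maximum. */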

/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};

/*
 * Increases the refcount in the given refcount table for all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
 * entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
    uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
    int flags)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table, l2_entry;
    uint64_t next_contiguous_offset = 0;
    int i, l2_size, nb_csectors;

    /* Read L2 table from disk */
    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = g_malloc(l2_size);

    if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
        goto fail;

    /* Do the actual checks */
    for(i = 0; i < s->l2_size; i++) {
        l2_entry = be64_to_cpu(l2_table[i]);

        switch (qcow2_get_cluster_type(l2_entry)) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
                    "copied flag must never be set for compressed "
                    "clusters\n", l2_entry >> s->cluster_bits);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* Mark cluster as used */
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            l2_entry &= s->cluster_offset_mask;
            inc_refcounts(bs, res, refcount_table, refcount_table_size,
                l2_entry & ~511, nb_csectors * 512);

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /* Compressed clusters are fragmented by nature. Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO:
            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
                break;
            }
            /* fall through */

        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used */
            inc_refcounts(bs, res, refcount_table, refcount_table_size,
                offset, s->cluster_size);

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
                    "properly aligned; L2 entry corrupted.\n", offset);
                res->corruptions++;
            }
            break;
        }

        case QCOW2_CLUSTER_UNALLOCATED:
            break;

        default:
            abort();
        }
    }

    g_free(l2_table);
    return 0;

fail:
    fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
    g_free(l2_table);
    return -EIO;
}

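/* Note that for compressed clusters inc_refcounts() is called above with
 * 512-byte sector granularity: several compressed clusters may share a host
 * cluster, which is then legitimately expected to have a refcount greater
 * than one. */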

/*
 * Increases the refcount for the L1 table, its L2 tables and all referenced
 * clusters in the given refcount table. While doing so, performs some checks
 * on L1 and L2 entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l1(BlockDriverState *bs,
                              BdrvCheckResult *res,
                              uint16_t *refcount_table,
                              int refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int flags)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, l2_offset, l1_size2;
    int i, ret;

    l1_size2 = l1_size * sizeof(uint64_t);

    /* Mark L1 table as used */
    inc_refcounts(bs, res, refcount_table, refcount_table_size,
                  l1_table_offset, l1_size2);

    /* Read L1 table entries from disk */
    if (l1_size2 == 0) {
        l1_table = NULL;
    } else {
        l1_table = g_try_malloc(l1_size2);
        if (l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        if (bdrv_pread(bs->file, l1_table_offset,
                       l1_table, l1_size2) != l1_size2)
            goto fail;
        for (i = 0; i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }

    /* Do the actual checks */
    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            /* Mark L2 table as used */
            l2_offset &= L1E_OFFSET_MASK;
            inc_refcounts(bs, res, refcount_table, refcount_table_size,
                l2_offset, s->cluster_size);

            /* L2 tables are cluster aligned */
            if (offset_into_cluster(s, l2_offset)) {
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                    "cluster aligned; L1 entry corrupted\n", l2_offset);
                res->corruptions++;
            }

            /* Process and check L2 entries */
            ret = check_refcounts_l2(bs, res, refcount_table,
                                     refcount_table_size, l2_offset, flags);
            if (ret < 0) {
                goto fail;
            }
        }
    }
    g_free(l1_table);
    return 0;

fail:
    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
    res->check_errors++;
    g_free(l1_table);
    return -EIO;
}

/*
 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
 *
 * This function does not print an error message nor does it increment
 * check_errors if get_refcount fails (this is because such an error will have
 * been already detected and sufficiently signaled by the calling function
 * (qcow2_check_refcounts) by the time this function is called).
 */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    int refcount;
    int i, j;

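    /* QCOW_OFLAG_COPIED must be set if and only if the refcount of the
     * referenced cluster is exactly 1, i.e. the cluster is not shared with
     * any snapshot and may be written to in place. */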
    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        bool l2_dirty = false;

        if (!l2_offset) {
            continue;
        }

        refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
        if (refcount < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%d\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" :
                                            "ERROR",
                    i, l1_entry, refcount);
            if (fix & BDRV_FIX_ERRORS) {
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            int cluster_type = qcow2_get_cluster_type(l2_entry);

            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
                refcount = get_refcount(bs, data_offset >> s->cluster_bits);
                if (refcount < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%d\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" :
                                                    "ERROR",
                            l2_entry, refcount);
                    if (fix & BDRV_FIX_ERRORS) {
                        l2_table[j] = cpu_to_be64(refcount == 1
                                    ? l2_entry |  QCOW_OFLAG_COPIED
                                    : l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty = true;
                        res->corruptions_fixed++;
                    } else {
                        res->corruptions++;
                    }
                }
            }
        }

        if (l2_dirty) {
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}

/*
 * Writes one sector of the refcount table to the disk
 */
#define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
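/* With 8-byte entries that is 64 entries per sector; e.g. updating entry 100
 * below rewrites the sector holding entries 64..127. */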
static int write_reftable_entry(BlockDriverState *bs, int rt_index)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t buf[RT_ENTRIES_PER_SECTOR];
    int rt_start_index;
    int i, ret;

    rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
    for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
        buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
    }

    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
            s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
            sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
            rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * Allocates a new cluster for the given refcount block (represented by its
 * offset in the image file) and copies the current content there. This
 * function does _not_ decrement the reference count for the currently
 * occupied cluster.
 *
 * This function prints an informative message to stderr on error (and returns
 * -errno); on success, the offset of the newly allocated cluster is returned.
 */
static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
                                      uint64_t offset)
{
    BDRVQcowState *s = bs->opaque;
    int64_t new_offset = 0;
    void *refcount_block = NULL;
    int ret;

    /* allocate new refcount block */
    new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
    if (new_offset < 0) {
        fprintf(stderr, "Could not allocate new cluster: %s\n",
                strerror(-new_offset));
        ret = new_offset;
        goto done;
    }

    /* fetch current refcount block content */
    ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
    if (ret < 0) {
        fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
        goto fail_free_cluster;
    }

    /* the new block has not yet been entered into the refcount table, so it
     * does not count as a refcount block yet as far as this check is
     * concerned */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
    if (ret < 0) {
        fprintf(stderr, "Could not write refcount block; metadata overlap "
                "check failed: %s\n", strerror(-ret));
        /* the image will be marked corrupt, so don't even attempt to free
         * the cluster */
        goto done;
    }

    /* write to new block */
    ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
            s->cluster_sectors);
    if (ret < 0) {
        fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
        goto fail_free_cluster;
    }

    /* update refcount table */
    assert(!offset_into_cluster(s, new_offset));
    s->refcount_table[reftable_index] = new_offset;
    ret = write_reftable_entry(bs, reftable_index);
    if (ret < 0) {
        fprintf(stderr, "Could not update refcount table: %s\n",
                strerror(-ret));
        goto fail_free_cluster;
    }

    goto done;

fail_free_cluster:
    qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER);

done:
    if (refcount_block) {
        /* This should never fail, as it would only do so if the given refcount
         * block cannot be found in the cache. As this is impossible as long as
         * there are no bugs, assert the success. */
        int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
        assert(tmp == 0);
    }

    if (ret < 0) {
        return ret;
    }

    return new_offset;
}

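/*
 * (Overview of the check, summarising the code below:) a temporary in-memory
 * refcount table is built by walking all reachable metadata and data (header,
 * active and snapshot L1/L2 tables, snapshot table, reftable and refblocks),
 * and is then compared entry by entry against the refcounts stored in the
 * image; mismatches are reported and, depending on fix, repaired.
 */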
/*
 * Checks an image for refcount consistency.
 *
 * Returns 0 if no errors are found, the number of errors in case the image is
 * detected as corrupted, and -errno when an internal error occurred.
 */
int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix)
{
    BDRVQcowState *s = bs->opaque;
    int64_t size, i, highest_cluster, nb_clusters;
    int refcount1, refcount2;
    QCowSnapshot *sn;
    uint16_t *refcount_table;
    int ret;

    size = bdrv_getlength(bs->file);
    if (size < 0) {
        res->check_errors++;
        return size;
    }

    nb_clusters = size_to_clusters(s, size);
    if (nb_clusters > INT_MAX) {
        res->check_errors++;
        return -EFBIG;
    }

    refcount_table = g_try_new0(uint16_t, nb_clusters);
    if (nb_clusters && refcount_table == NULL) {
        res->check_errors++;
        return -ENOMEM;
    }

    res->bfi.total_clusters =
        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);

    /* header */
    inc_refcounts(bs, res, refcount_table, nb_clusters,
                  0, s->cluster_size);

    /* current L1 table */
    ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
    if (ret < 0) {
        goto fail;
    }

    /* snapshots */
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
        ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
                                 sn->l1_table_offset, sn->l1_size, 0);
        if (ret < 0) {
            goto fail;
        }
    }
    inc_refcounts(bs, res, refcount_table, nb_clusters,
                  s->snapshots_offset, s->snapshots_size);

    /* refcount data */
    inc_refcounts(bs, res, refcount_table, nb_clusters,
                  s->refcount_table_offset,
                  s->refcount_table_size * sizeof(uint64_t));

    for(i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i];
        cluster = offset >> s->cluster_bits;

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            continue;
        }

        if (cluster >= nb_clusters) {
            fprintf(stderr, "ERROR refcount block %" PRId64
                    " is outside image\n", i);
            res->corruptions++;
            continue;
        }

        if (offset != 0) {
            inc_refcounts(bs, res, refcount_table, nb_clusters,
                          offset, s->cluster_size);
            if (refcount_table[cluster] != 1) {
                fprintf(stderr, "%s refcount block %" PRId64
                        " refcount=%d\n",
                        fix & BDRV_FIX_ERRORS ? "Repairing" :
                                                "ERROR",
                        i, refcount_table[cluster]);

                if (fix & BDRV_FIX_ERRORS) {
                    int64_t new_offset;

                    new_offset = realloc_refcount_block(bs, i, offset);
                    if (new_offset < 0) {
                        res->corruptions++;
                        continue;
                    }

                    /* update refcounts */
                    if ((new_offset >> s->cluster_bits) >= nb_clusters) {
                        /* increase refcount_table size if necessary */
                        int old_nb_clusters = nb_clusters;
                        nb_clusters = (new_offset >> s->cluster_bits) + 1;
                        refcount_table = g_renew(uint16_t, refcount_table,
                                                 nb_clusters);
                        memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
                                - old_nb_clusters) * sizeof(uint16_t));
                    }
                    refcount_table[cluster]--;
                    inc_refcounts(bs, res, refcount_table, nb_clusters,
                                  new_offset, s->cluster_size);

                    res->corruptions_fixed++;
                } else {
                    res->corruptions++;
                }
            }
        }
    }

    /* compare ref counts */
    for (i = 0, highest_cluster = 0; i < nb_clusters; i++) {
        refcount1 = get_refcount(bs, i);
        if (refcount1 < 0) {
            fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
                    i, strerror(-refcount1));
            res->check_errors++;
            continue;
        }

        refcount2 = refcount_table[i];

        if (refcount1 > 0 || refcount2 > 0) {
            highest_cluster = i;
        }

        if (refcount1 != refcount2) {

            /* Check if we're allowed to fix the mismatch */
            int *num_fixed = NULL;
            if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
                num_fixed = &res->leaks_fixed;
            } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
                num_fixed = &res->corruptions_fixed;
            }

            fprintf(stderr, "%s cluster %" PRId64 " refcount=%d reference=%d\n",
                    num_fixed != NULL     ? "Repairing" :
                    refcount1 < refcount2 ? "ERROR" :
                                            "Leaked",
                    i, refcount1, refcount2);

            if (num_fixed) {
                ret = update_refcount(bs, i << s->cluster_bits, 1,
                                      refcount2 - refcount1,
                                      QCOW2_DISCARD_ALWAYS);
                if (ret >= 0) {
                    (*num_fixed)++;
                    continue;
                }
            }

            /* And if we couldn't, print an error */
            if (refcount1 < refcount2) {
                res->corruptions++;
            } else {
                res->leaks++;
            }
        }
    }

    /* check OFLAG_COPIED */
    ret = check_oflag_copied(bs, res, fix);
    if (ret < 0) {
        goto fail;
    }

    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
    ret = 0;

fail:
    g_free(refcount_table);

    return ret;
}

#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)

/*
 * Checks if the given offset into the image file is actually free to use by
 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
 * i.e. a sanity check without relying on the refcount tables.
 *
 * The ign parameter specifies what checks not to perform (being a bitmask of
 * QCow2MetadataOverlap values), i.e., what sections to ignore.
 *
 * Returns:
 * - 0 if writing to this offset will not affect the mentioned metadata
 * - a positive QCow2MetadataOverlap value indicating one overlapping section
 * - a negative value (-errno) indicating an error while performing a check,
 *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
 */
int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                                 int64_t size)
{
    BDRVQcowState *s = bs->opaque;
    int chk = s->overlap_check & ~ign;
    int i, j;

    if (!size) {
        return 0;
    }

    if (chk & QCOW2_OL_MAIN_HEADER) {
        if (offset < s->cluster_size) {
            return QCOW2_OL_MAIN_HEADER;
        }
    }

    /* align range to test to cluster boundaries */
    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
    offset = start_of_cluster(s, offset);

    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
            return QCOW2_OL_ACTIVE_L1;
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
        if (overlaps_with(s->refcount_table_offset,
            s->refcount_table_size * sizeof(uint64_t))) {
            return QCOW2_OL_REFCOUNT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
            return QCOW2_OL_SNAPSHOT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            if (s->snapshots[i].l1_size &&
                overlaps_with(s->snapshots[i].l1_table_offset,
                s->snapshots[i].l1_size * sizeof(uint64_t))) {
                return QCOW2_OL_INACTIVE_L1;
            }
        }
    }

    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
        for (i = 0; i < s->l1_size; i++) {
            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_ACTIVE_L2;
            }
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
        for (i = 0; i < s->refcount_table_size; i++) {
            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_REFCOUNT_BLOCK;
            }
        }
    }

    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
            uint32_t l1_sz  = s->snapshots[i].l1_size;
            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
            uint64_t *l1 = g_try_malloc(l1_sz2);
            int ret;

            if (l1_sz2 && l1 == NULL) {
                return -ENOMEM;
            }

            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
            if (ret < 0) {
                g_free(l1);
                return ret;
            }

            for (j = 0; j < l1_sz; j++) {
                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
                    g_free(l1);
                    return QCOW2_OL_INACTIVE_L2;
                }
            }

            g_free(l1);
        }
    }

    return 0;
}

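/* Each QCOW2_OL_* flag is a single bit, so s->overlap_check (set via the
 * overlap-check option) and the ign parameter can switch individual checks on
 * and off; the table below maps bit numbers to human-readable names for the
 * corruption message in qcow2_pre_write_overlap_check(). */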
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};

/*
 * First performs a check for metadata overlaps (through
 * qcow2_check_metadata_overlap); if that fails with a negative value (error
 * while performing a check), that value is returned. If an impending overlap
 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
 * and -EIO returned.
 *
 * Returns 0 if there were neither overlaps nor errors while checking for
 * overlaps; or a negative value (-errno) on error.
 */
int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
                                  int64_t size)
{
    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);

    if (ret < 0) {
        return ret;
    } else if (ret > 0) {
        int metadata_ol_bitnr = ffs(ret) - 1;
        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);

        qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
                                "write on metadata (overlaps with %s)",
                                metadata_ol_names[metadata_ol_bitnr]);
        return -EIO;
    }

    return 0;
}