/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"
#include "qemu/cutils.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};


/*********************************************************/
/* refcount handling */

static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}

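/*
 * The accessors below implement the refcount_order abstraction: one refcount
 * entry is 1 << refcount_order bits wide. Informally:
 *
 *   refcount_order 0 -> 1-bit entries, eight refcounts packed per byte
 *   refcount_order 4 -> 16-bit entries (the traditional qcow2 width)
 *   refcount_order 6 -> 64-bit entries, one refcount per eight bytes
 *
 * Sub-byte widths are packed starting at the least significant bits of each
 * byte; multi-byte widths are stored big-endian, as everywhere else in the
 * qcow2 format.
 */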
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}
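/*
 * A worked example of the index arithmetic used below (informal, assuming
 * 64 kB clusters and the default 16-bit refcount entries): one refcount
 * block holds 65536 / 2 = 32768 entries, so refcount_block_bits is 15.
 * For cluster_index 100000, the refcount table index is 100000 >> 15 = 3
 * and the index within that refcount block is 100000 & 32767 = 1696.
 */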
/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
                                  uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}
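/*
 * Informally, for in_same_refcount_block(): with 64 kB clusters and 16-bit
 * entries, one refcount block covers 32768 * 64 kB = 2 GB of host file, so
 * two offsets share a refcount block iff they fall into the same
 * 2 GB-aligned window.
 */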
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in the error case.
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(); it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
            " at %" PRIx64 "\n",
            refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
                          (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount;
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
                           s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
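/*
 * Note on the -EAGAIN protocol above: callers that are in the middle of an
 * allocation simply retry it from scratch, as qcow2_alloc_clusters() does
 * further down in this file (sketch):
 *
 *     do {
 *         offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
 *         ret = update_refcount(bs, offset, size, 1, false,
 *                               QCOW2_DISCARD_NEVER);
 *     } while (ret == -EAGAIN);
 */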
/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
    table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable.
         */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
                           table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
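/*
 * Sizing example for the reftable rounding above (informal, assuming 64 kB
 * clusters): one reftable cluster holds 65536 / 8 = 8192 entries, so
 * table_size is always rounded up to a multiple of 8192 and table_clusters
 * is the resulting number of reftable clusters.
 */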
void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_pdiscard(bs->file, d->offset, d->bytes);
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs = bs,
        .offset = offset,
        .bytes = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
               || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
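/*
 * Merging example (informal): with 64 kB clusters, freeing the cluster at
 * 0x10000 queues the region [0x10000, 0x20000); subsequently freeing the
 * cluster at 0x20000 is adjacent, so the queue ends up holding the single
 * region [0x10000, 0x30000) and only one discard request reaches the
 * protocol layer in qcow2_process_discards().
 */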
"-" : "", 814 addend); 815 #endif 816 if (length < 0) { 817 return -EINVAL; 818 } else if (length == 0) { 819 return 0; 820 } 821 822 if (decrease) { 823 qcow2_cache_set_dependency(bs, s->refcount_block_cache, 824 s->l2_table_cache); 825 } 826 827 start = start_of_cluster(s, offset); 828 last = start_of_cluster(s, offset + length - 1); 829 for(cluster_offset = start; cluster_offset <= last; 830 cluster_offset += s->cluster_size) 831 { 832 int block_index; 833 uint64_t refcount; 834 int64_t cluster_index = cluster_offset >> s->cluster_bits; 835 int64_t table_index = cluster_index >> s->refcount_block_bits; 836 837 /* Load the refcount block and allocate it if needed */ 838 if (table_index != old_table_index) { 839 if (refcount_block) { 840 qcow2_cache_put(s->refcount_block_cache, &refcount_block); 841 } 842 ret = alloc_refcount_block(bs, cluster_index, &refcount_block); 843 /* If the caller needs to restart the search for free clusters, 844 * try the same ones first to see if they're still free. */ 845 if (ret == -EAGAIN) { 846 if (s->free_cluster_index > (start >> s->cluster_bits)) { 847 s->free_cluster_index = (start >> s->cluster_bits); 848 } 849 } 850 if (ret < 0) { 851 goto fail; 852 } 853 } 854 old_table_index = table_index; 855 856 qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); 857 858 /* we can update the count and save it */ 859 block_index = cluster_index & (s->refcount_block_size - 1); 860 861 refcount = s->get_refcount(refcount_block, block_index); 862 if (decrease ? (refcount - addend > refcount) 863 : (refcount + addend < refcount || 864 refcount + addend > s->refcount_max)) 865 { 866 ret = -EINVAL; 867 goto fail; 868 } 869 if (decrease) { 870 refcount -= addend; 871 } else { 872 refcount += addend; 873 } 874 if (refcount == 0 && cluster_index < s->free_cluster_index) { 875 s->free_cluster_index = cluster_index; 876 } 877 s->set_refcount(refcount_block, block_index, refcount); 878 879 if (refcount == 0) { 880 void *table; 881 882 table = qcow2_cache_is_table_offset(s->refcount_block_cache, 883 offset); 884 if (table != NULL) { 885 qcow2_cache_put(s->refcount_block_cache, &refcount_block); 886 qcow2_cache_discard(s->refcount_block_cache, table); 887 } 888 889 table = qcow2_cache_is_table_offset(s->l2_table_cache, offset); 890 if (table != NULL) { 891 qcow2_cache_discard(s->l2_table_cache, table); 892 } 893 894 if (s->discard_passthrough[type]) { 895 update_refcount_discard(bs, cluster_offset, s->cluster_size); 896 } 897 } 898 } 899 900 ret = 0; 901 fail: 902 if (!s->cache_discards) { 903 qcow2_process_discards(bs, ret); 904 } 905 906 /* Write last changed block to disk */ 907 if (refcount_block) { 908 qcow2_cache_put(s->refcount_block_cache, &refcount_block); 909 } 910 911 /* 912 * Try do undo any updates if an error is returned (This may succeed in 913 * some cases like ENOSPC for allocating a new refcount block) 914 */ 915 if (ret < 0) { 916 int dummy; 917 dummy = update_refcount(bs, offset, cluster_offset - offset, addend, 918 !decrease, QCOW2_DISCARD_NEVER); 919 (void)dummy; 920 } 921 922 return ret; 923 } 924 925 /* 926 * Increases or decreases the refcount of a given cluster. 927 * 928 * @addend is the absolute value of the addend; if @decrease is set, @addend 929 * will be subtracted from the current refcount, otherwise it will be added. 930 * 931 * On success 0 is returned; on failure -errno is returned. 
/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in the requested max */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (max >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}
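/*
 * Note that qcow2_alloc_clusters_at() may allocate fewer clusters than
 * requested: if, say, four clusters are requested at @offset but the third
 * one is already in use, only the first two are allocated and 2 is
 * returned; callers have to cope with such partial allocations.
 */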
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
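/*
 * Packing example for qcow2_alloc_bytes() (informal): each sub-cluster
 * allocation bumps the refcount of the containing cluster, so two
 * back-to-back qcow2_alloc_bytes(bs, 512) calls that fit into one cluster
 * return adjacent offsets and leave that cluster with a refcount of 2.
 */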
void qcow2_free_clusters(BlockDriverState *bs,
                         int64_t offset, int64_t size,
                         enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;

    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                                (l2_entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, false, -1, -1,
                                    "Cannot free unaligned cluster %#llx",
                                    l2_entry & L2E_OFFSET_MASK);
        } else {
            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                nb_clusters << s->cluster_bits, type);
        }
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}
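/*
 * Compressed cluster size arithmetic, as used above and in the consistency
 * checks further down (informal): the csize field of a compressed L2 entry
 * stores the number of occupied 512-byte sectors minus one, so the code
 * recovers nb_csectors = ((l2_entry >> csize_shift) & csize_mask) + 1 and
 * accounts for nb_csectors * 512 bytes starting at the entry's host offset
 * rounded down to a sector boundary.
 */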
int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = qcow2_cache_write(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    if (qcow2_need_accurate_refcounts(s)) {
        ret = qcow2_cache_write(bs, s->refcount_block_cache);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
{
    int ret = qcow2_write_caches(bs);
    if (ret < 0) {
        return ret;
    }

    return bdrv_flush(bs->file->bs);
}

/*********************************************************/
/* snapshots and image creation */

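/*
 * Rough sketch of how the next function is driven (informal): a positive
 * addend is used when clusters become shared with a snapshot, a negative
 * one when a snapshot's references go away, and addend = 0 leaves the
 * refcounts alone and only recalculates the QCOW_OFLAG_COPIED flags, which
 * must be set exactly on entries whose cluster refcount is 1.
 */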
/* update the refcounts of snapshots and the copied flag */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                                   int64_t l1_table_offset, int l1_size,
                                   int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    slice_size2 = s->l2_slice_size * sizeof(uint64_t);
    n_slices = s->cluster_size / slice_size2;

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = be64_to_cpu(l2_slice[j]);
                    old_entry = entry;
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((entry >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(
                                bs, (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        l2_slice[j] = cpu_to_be64(entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}




/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}

/**
 * Reallocates *array so that it can hold new_size entries. *size must contain
 * the current number of entries in *array. If the reallocation fails, *array
 * and *size will not be modified and -errno will be returned. If the
 * reallocation is successful, *array will be set to the new buffer, *size
 * will be set to new_size and 0 will be returned. The size of the reallocated
 * refcount array buffer will be aligned to a cluster boundary, and the newly
 * allocated area will be zeroed.
 */
static int realloc_refcount_array(BDRVQcow2State *s, void **array,
                                  int64_t *size, int64_t new_size)
{
    int64_t old_byte_size, new_byte_size;
    void *new_ptr;

    /* Round to clusters so the array can be directly written to disk */
    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
                    * s->cluster_size;
    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
                    * s->cluster_size;

    if (new_byte_size == old_byte_size) {
        *size = new_size;
        return 0;
    }

    assert(new_byte_size > 0);

    if (new_byte_size > SIZE_MAX) {
        return -ENOMEM;
    }

    new_ptr = g_try_realloc(*array, new_byte_size);
    if (!new_ptr) {
        return -ENOMEM;
    }

    if (new_byte_size > old_byte_size) {
        memset((char *)new_ptr + old_byte_size, 0,
               new_byte_size - old_byte_size);
    }

    *array = new_ptr;
    *size = new_size;

    return 0;
}
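/*
 * Rounding example (informal): with 64 kB clusters and 16-bit refcount
 * entries, growing an array from 1000 to 1001 entries changes the byte size
 * from 2000 to 2002 bytes, both of which round up to a single cluster, so
 * only *size is updated and no reallocation takes place.
 */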
/*
 * Increases the refcount for a range of clusters in a given refcount table.
 * This is used to construct a temporary refcount table out of L1 and L2 tables
 * which can be compared to the refcount table saved in the image.
 *
 * Modifies the number of errors in res.
 */
int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
                             void **refcount_table,
                             int64_t *refcount_table_size,
                             int64_t offset, int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, last, cluster_offset, k, refcount;
    int ret;

    if (size <= 0) {
        return 0;
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + size - 1);
    for (cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        if (k >= *refcount_table_size) {
            ret = realloc_refcount_array(s, refcount_table,
                                         refcount_table_size, k + 1);
            if (ret < 0) {
                res->check_errors++;
                return ret;
            }
        }

        refcount = s->get_refcount(*refcount_table, k);
        if (refcount == s->refcount_max) {
            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
                    "width or qemu-img convert to create a clean copy if the "
                    "image cannot be opened for writing\n");
            res->corruptions++;
            continue;
        }
        s->set_refcount(*refcount_table, k, refcount + 1);
    }

    return 0;
}
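/*
 * On the overflow check above (informal): with the default 16-bit entries,
 * s->refcount_max is 65535; a cluster referenced more often than that
 * cannot be represented in the in-memory refcount table either, so it is
 * reported as a corruption instead of being allowed to wrap around.
 */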
/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};

/*
 * Increases the refcount in the given refcount table for all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
 * entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size, int64_t l2_offset,
                              int flags, BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table, l2_entry;
    uint64_t next_contiguous_offset = 0;
    int i, l2_size, nb_csectors, ret;

    /* Read L2 table from disk */
    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = g_malloc(l2_size);

    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        goto fail;
    }

    /* Do the actual checks */
    for (i = 0; i < s->l2_size; i++) {
        l2_entry = be64_to_cpu(l2_table[i]);

        switch (qcow2_get_cluster_type(l2_entry)) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
                        "copied flag must never be set for compressed "
                        "clusters\n", l2_entry & s->cluster_offset_mask);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* Mark cluster as used */
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            l2_entry &= s->cluster_offset_mask;
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           l2_entry & ~511, nb_csectors * 512);
            if (ret < 0) {
                goto fail;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /* Compressed clusters are fragmented by nature. Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                if (qcow2_get_cluster_type(l2_entry) ==
                    QCOW2_CLUSTER_ZERO_ALLOC)
                {
                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
                            "cluster is not properly aligned; L2 entry "
                            "corrupted.\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
                            offset);
                    if (fix & BDRV_FIX_ERRORS) {
                        uint64_t l2e_offset =
                            l2_offset + (uint64_t)i * sizeof(uint64_t);

                        l2_entry = QCOW_OFLAG_ZERO;
                        l2_table[i] = cpu_to_be64(l2_entry);
                        ret = qcow2_pre_write_overlap_check(bs,
                                QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
                                l2e_offset, sizeof(uint64_t));
                        if (ret < 0) {
                            fprintf(stderr, "ERROR: Overlap check failed\n");
                            res->check_errors++;
                            /* Something is seriously wrong, so abort checking
                             * this L2 table */
                            goto fail;
                        }

                        ret = bdrv_pwrite_sync(bs->file, l2e_offset,
                                               &l2_table[i], sizeof(uint64_t));
                        if (ret < 0) {
                            fprintf(stderr, "ERROR: Failed to overwrite L2 "
                                    "table entry: %s\n", strerror(-ret));
                            res->check_errors++;
                            /* Do not abort, continue checking the rest of this
                             * L2 table's entries */
                        } else {
                            res->corruptions_fixed++;
                            /* Skip marking the cluster as used
                             * (it is unused now) */
                            continue;
                        }
                    } else {
                        res->corruptions++;
                    }
                } else {
                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
                            "not properly aligned; L2 entry corrupted.\n",
                            offset);
                    res->corruptions++;
                }
            }

            /* Mark cluster as used */
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }
            break;
        }

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_UNALLOCATED:
            break;

        default:
            abort();
        }
    }

    g_free(l2_table);
    return 0;

fail:
    g_free(l2_table);
    return ret;
}

/*
 * Increases the refcount for the L1 table, its L2 tables and all referenced
 * clusters in the given refcount table. While doing so, performs some checks
 * on L1 and L2 entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l1(BlockDriverState *bs,
                              BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int flags, BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL, l2_offset, l1_size2;
    int i, ret;

    l1_size2 = l1_size * sizeof(uint64_t);

    /* Mark L1 table as used */
    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
                                   l1_table_offset, l1_size2);
    if (ret < 0) {
        goto fail;
    }

    /* Read L1 table entries from disk */
    if (l1_size2 > 0) {
        l1_table = g_try_malloc(l1_size2);
        if (l1_table == NULL) {
            ret = -ENOMEM;
            res->check_errors++;
            goto fail;
        }
        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
            res->check_errors++;
            goto fail;
        }
        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }

    /* Do the actual checks */
    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            /* Mark L2 table as used */
            l2_offset &= L1E_OFFSET_MASK;
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           l2_offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* L2 tables are cluster aligned */
            if (offset_into_cluster(s, l2_offset)) {
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                        "cluster aligned; L1 entry corrupted\n", l2_offset);
                res->corruptions++;
            }

            /* Process and check L2 entries */
            ret = check_refcounts_l2(bs, res, refcount_table,
                                     refcount_table_size, l2_offset, flags,
                                     fix);
            if (ret < 0) {
                goto fail;
            }
        }
    }
    g_free(l1_table);
    return 0;

fail:
    g_free(l1_table);
    return ret;
}
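/*
 * Reminder on the COPIED semantics checked below (informal): an L1 or L2
 * entry may have QCOW_OFLAG_COPIED set only if the referenced cluster has a
 * refcount of exactly 1, i.e. it is not shared with a snapshot and may be
 * written to in place; any other combination is a (repairable)
 * inconsistency.
 */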
/*
 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
 *
 * This function does not print an error message nor does it increment
 * check_errors if qcow2_get_refcount fails (this is because such an error will
 * have been already detected and sufficiently signaled by the calling function
 * (qcow2_check_refcounts) by the time this function is called).
 */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;
    bool repair;

    if (fix & BDRV_FIX_ERRORS) {
        /* Always repair */
        repair = true;
    } else if (fix & BDRV_FIX_LEAKS) {
        /* Repair only if that seems safe: This function is always
         * called after the refcounts have been fixed, so the refcount
         * is accurate if that repair was successful */
        repair = !res->check_errors && !res->corruptions && !res->leaks;
    } else {
        repair = false;
    }

    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        bool l2_dirty = false;

        if (!l2_offset) {
            continue;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
            if (repair) {
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                ret = qcow2_get_refcount(bs,
                                         data_offset >> s->cluster_bits,
                                         &refcount);
                if (ret < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            repair ? "Repairing" : "ERROR", l2_entry, refcount);
                    if (repair) {
l2_entry | QCOW_OFLAG_COPIED 1886 : l2_entry & ~QCOW_OFLAG_COPIED); 1887 l2_dirty = true; 1888 res->corruptions_fixed++; 1889 } else { 1890 res->corruptions++; 1891 } 1892 } 1893 } 1894 } 1895 1896 if (l2_dirty) { 1897 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, 1898 l2_offset, s->cluster_size); 1899 if (ret < 0) { 1900 fprintf(stderr, "ERROR: Could not write L2 table; metadata " 1901 "overlap check failed: %s\n", strerror(-ret)); 1902 res->check_errors++; 1903 goto fail; 1904 } 1905 1906 ret = bdrv_pwrite(bs->file, l2_offset, l2_table, 1907 s->cluster_size); 1908 if (ret < 0) { 1909 fprintf(stderr, "ERROR: Could not write L2 table: %s\n", 1910 strerror(-ret)); 1911 res->check_errors++; 1912 goto fail; 1913 } 1914 } 1915 } 1916 1917 ret = 0; 1918 1919 fail: 1920 qemu_vfree(l2_table); 1921 return ret; 1922 } 1923 1924 /* 1925 * Checks consistency of refblocks and accounts for each refblock in 1926 * *refcount_table. 1927 */ 1928 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, 1929 BdrvCheckMode fix, bool *rebuild, 1930 void **refcount_table, int64_t *nb_clusters) 1931 { 1932 BDRVQcow2State *s = bs->opaque; 1933 int64_t i, size; 1934 int ret; 1935 1936 for (i = 0; i < s->refcount_table_size; i++) { 1937 uint64_t offset, cluster; 1938 offset = s->refcount_table[i]; 1939 cluster = offset >> s->cluster_bits; 1940 1941 /* Refcount blocks are cluster aligned */ 1942 if (offset_into_cluster(s, offset)) { 1943 fprintf(stderr, "ERROR refcount block %" PRId64 " is not " 1944 "cluster aligned; refcount table entry corrupted\n", i); 1945 res->corruptions++; 1946 *rebuild = true; 1947 continue; 1948 } 1949 1950 if (cluster >= *nb_clusters) { 1951 fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n", 1952 fix & BDRV_FIX_ERRORS ?
"Repairing" : "ERROR", i); 1953 1954 if (fix & BDRV_FIX_ERRORS) { 1955 int64_t new_nb_clusters; 1956 Error *local_err = NULL; 1957 1958 if (offset > INT64_MAX - s->cluster_size) { 1959 ret = -EINVAL; 1960 goto resize_fail; 1961 } 1962 1963 ret = bdrv_truncate(bs->file, offset + s->cluster_size, 1964 PREALLOC_MODE_OFF, &local_err); 1965 if (ret < 0) { 1966 error_report_err(local_err); 1967 goto resize_fail; 1968 } 1969 size = bdrv_getlength(bs->file->bs); 1970 if (size < 0) { 1971 ret = size; 1972 goto resize_fail; 1973 } 1974 1975 new_nb_clusters = size_to_clusters(s, size); 1976 assert(new_nb_clusters >= *nb_clusters); 1977 1978 ret = realloc_refcount_array(s, refcount_table, 1979 nb_clusters, new_nb_clusters); 1980 if (ret < 0) { 1981 res->check_errors++; 1982 return ret; 1983 } 1984 1985 if (cluster >= *nb_clusters) { 1986 ret = -EINVAL; 1987 goto resize_fail; 1988 } 1989 1990 res->corruptions_fixed++; 1991 ret = qcow2_inc_refcounts_imrt(bs, res, 1992 refcount_table, nb_clusters, 1993 offset, s->cluster_size); 1994 if (ret < 0) { 1995 return ret; 1996 } 1997 /* No need to check whether the refcount is now greater than 1: 1998 * This area was just allocated and zeroed, so it can only be 1999 * exactly 1 after qcow2_inc_refcounts_imrt() */ 2000 continue; 2001 2002 resize_fail: 2003 res->corruptions++; 2004 *rebuild = true; 2005 fprintf(stderr, "ERROR could not resize image: %s\n", 2006 strerror(-ret)); 2007 } else { 2008 res->corruptions++; 2009 } 2010 continue; 2011 } 2012 2013 if (offset != 0) { 2014 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2015 offset, s->cluster_size); 2016 if (ret < 0) { 2017 return ret; 2018 } 2019 if (s->get_refcount(*refcount_table, cluster) != 1) { 2020 fprintf(stderr, "ERROR refcount block %" PRId64 2021 " refcount=%" PRIu64 "\n", i, 2022 s->get_refcount(*refcount_table, cluster)); 2023 res->corruptions++; 2024 *rebuild = true; 2025 } 2026 } 2027 } 2028 2029 return 0; 2030 } 2031 2032 /* 2033 * Calculates an in-memory refcount table. 
2034 */ 2035 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 2036 BdrvCheckMode fix, bool *rebuild, 2037 void **refcount_table, int64_t *nb_clusters) 2038 { 2039 BDRVQcow2State *s = bs->opaque; 2040 int64_t i; 2041 QCowSnapshot *sn; 2042 int ret; 2043 2044 if (!*refcount_table) { 2045 int64_t old_size = 0; 2046 ret = realloc_refcount_array(s, refcount_table, 2047 &old_size, *nb_clusters); 2048 if (ret < 0) { 2049 res->check_errors++; 2050 return ret; 2051 } 2052 } 2053 2054 /* header */ 2055 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2056 0, s->cluster_size); 2057 if (ret < 0) { 2058 return ret; 2059 } 2060 2061 /* current L1 table */ 2062 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, 2063 s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO, 2064 fix); 2065 if (ret < 0) { 2066 return ret; 2067 } 2068 2069 /* snapshots */ 2070 for (i = 0; i < s->nb_snapshots; i++) { 2071 sn = s->snapshots + i; 2072 if (offset_into_cluster(s, sn->l1_table_offset)) { 2073 fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": " 2074 "L1 table is not cluster aligned; snapshot table entry " 2075 "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset); 2076 res->corruptions++; 2077 continue; 2078 } 2079 if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { 2080 fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": " 2081 "L1 table is too large; snapshot table entry corrupted\n", 2082 sn->id_str, sn->name, sn->l1_size); 2083 res->corruptions++; 2084 continue; 2085 } 2086 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, 2087 sn->l1_table_offset, sn->l1_size, 0, fix); 2088 if (ret < 0) { 2089 return ret; 2090 } 2091 } 2092 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2093 s->snapshots_offset, s->snapshots_size); 2094 if (ret < 0) { 2095 return ret; 2096 } 2097 2098 /* refcount data */ 2099 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2100 s->refcount_table_offset, 2101 s->refcount_table_size * sizeof(uint64_t)); 2102 if (ret < 0) { 2103 return ret; 2104 } 2105 2106 /* encryption */ 2107 if (s->crypto_header.length) { 2108 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2109 s->crypto_header.offset, 2110 s->crypto_header.length); 2111 if (ret < 0) { 2112 return ret; 2113 } 2114 } 2115 2116 /* bitmaps */ 2117 ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters); 2118 if (ret < 0) { 2119 return ret; 2120 } 2121 2122 return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters); 2123 } 2124 2125 /* 2126 * Compares the actual reference count for each cluster in the image against the 2127 * refcount as reported by the refcount structures on-disk. 
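 *
 * A too-high on-disk refcount is reported as a leak, a too-low one as a
 * corruption; an on-disk refcount of 0 for a cluster that is actually in use
 * cannot be fixed in place and therefore forces a rebuild of the refcount
 * structures. Depending on @fix, leaks and corruptions are repaired by
 * adjusting the on-disk refcount to the calculated reference count.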
2128 */ 2129 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 2130 BdrvCheckMode fix, bool *rebuild, 2131 int64_t *highest_cluster, 2132 void *refcount_table, int64_t nb_clusters) 2133 { 2134 BDRVQcow2State *s = bs->opaque; 2135 int64_t i; 2136 uint64_t refcount1, refcount2; 2137 int ret; 2138 2139 for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { 2140 ret = qcow2_get_refcount(bs, i, &refcount1); 2141 if (ret < 0) { 2142 fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", 2143 i, strerror(-ret)); 2144 res->check_errors++; 2145 continue; 2146 } 2147 2148 refcount2 = s->get_refcount(refcount_table, i); 2149 2150 if (refcount1 > 0 || refcount2 > 0) { 2151 *highest_cluster = i; 2152 } 2153 2154 if (refcount1 != refcount2) { 2155 /* Check if we're allowed to fix the mismatch */ 2156 int *num_fixed = NULL; 2157 if (refcount1 == 0) { 2158 *rebuild = true; 2159 } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { 2160 num_fixed = &res->leaks_fixed; 2161 } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { 2162 num_fixed = &res->corruptions_fixed; 2163 } 2164 2165 fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64 2166 " reference=%" PRIu64 "\n", 2167 num_fixed != NULL ? "Repairing" : 2168 refcount1 < refcount2 ? "ERROR" : 2169 "Leaked", 2170 i, refcount1, refcount2); 2171 2172 if (num_fixed) { 2173 ret = update_refcount(bs, i << s->cluster_bits, 1, 2174 refcount_diff(refcount1, refcount2), 2175 refcount1 > refcount2, 2176 QCOW2_DISCARD_ALWAYS); 2177 if (ret >= 0) { 2178 (*num_fixed)++; 2179 continue; 2180 } 2181 } 2182 2183 /* And if we couldn't, print an error */ 2184 if (refcount1 < refcount2) { 2185 res->corruptions++; 2186 } else { 2187 res->leaks++; 2188 } 2189 } 2190 } 2191 } 2192 2193 /* 2194 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to 2195 * the on-disk refcount structures. 2196 * 2197 * On input, *first_free_cluster tells where to start looking, and need not 2198 * actually be a free cluster; the returned offset will not be before that 2199 * cluster. On output, *first_free_cluster points to the first gap found, even 2200 * if that gap was too small to be used as the returned offset. 2201 * 2202 * Note that *first_free_cluster is a cluster index whereas the return value is 2203 * an offset. 
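 *
 * (For example, with 64 KiB clusters, i.e. cluster_bits = 16, a return value
 * of 0x20000 means that cluster index 2 was the first of the allocated
 * clusters.)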
2204 */ 2205 static int64_t alloc_clusters_imrt(BlockDriverState *bs, 2206 int cluster_count, 2207 void **refcount_table, 2208 int64_t *imrt_nb_clusters, 2209 int64_t *first_free_cluster) 2210 { 2211 BDRVQcow2State *s = bs->opaque; 2212 int64_t cluster = *first_free_cluster, i; 2213 bool first_gap = true; 2214 int contiguous_free_clusters; 2215 int ret; 2216 2217 /* Starting at *first_free_cluster, find a range of at least cluster_count 2218 * continuously free clusters */ 2219 for (contiguous_free_clusters = 0; 2220 cluster < *imrt_nb_clusters && 2221 contiguous_free_clusters < cluster_count; 2222 cluster++) 2223 { 2224 if (!s->get_refcount(*refcount_table, cluster)) { 2225 contiguous_free_clusters++; 2226 if (first_gap) { 2227 /* If this is the first free cluster found, update 2228 * *first_free_cluster accordingly */ 2229 *first_free_cluster = cluster; 2230 first_gap = false; 2231 } 2232 } else if (contiguous_free_clusters) { 2233 contiguous_free_clusters = 0; 2234 } 2235 } 2236 2237 /* If contiguous_free_clusters is greater than zero, it contains the number 2238 * of continuously free clusters until the current cluster; the first free 2239 * cluster in the current "gap" is therefore 2240 * cluster - contiguous_free_clusters */ 2241 2242 /* If no such range could be found, grow the in-memory refcount table 2243 * accordingly to append free clusters at the end of the image */ 2244 if (contiguous_free_clusters < cluster_count) { 2245 /* contiguous_free_clusters clusters are already empty at the image end; 2246 * we need cluster_count clusters; therefore, we have to allocate 2247 * cluster_count - contiguous_free_clusters new clusters at the end of 2248 * the image (which is the current value of cluster; note that cluster 2249 * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond 2250 * the image end) */ 2251 ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters, 2252 cluster + cluster_count 2253 - contiguous_free_clusters); 2254 if (ret < 0) { 2255 return ret; 2256 } 2257 } 2258 2259 /* Go back to the first free cluster */ 2260 cluster -= contiguous_free_clusters; 2261 for (i = 0; i < cluster_count; i++) { 2262 s->set_refcount(*refcount_table, cluster + i, 1); 2263 } 2264 2265 return cluster << s->cluster_bits; 2266 } 2267 2268 /* 2269 * Creates a new refcount structure based solely on the in-memory information 2270 * given through *refcount_table. All necessary allocations will be reflected 2271 * in that array. 2272 * 2273 * On success, the old refcount structure is leaked (it will be covered by the 2274 * new refcount structure). 
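 *
 * (The clusters leaked this way can be reclaimed afterwards by a leak-fixing
 * pass; qcow2_check_refcounts() runs one automatically after a rebuild when
 * BDRV_FIX_LEAKS is set.)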
2275 */ 2276 static int rebuild_refcount_structure(BlockDriverState *bs, 2277 BdrvCheckResult *res, 2278 void **refcount_table, 2279 int64_t *nb_clusters) 2280 { 2281 BDRVQcow2State *s = bs->opaque; 2282 int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; 2283 int64_t refblock_offset, refblock_start, refblock_index; 2284 uint32_t reftable_size = 0; 2285 uint64_t *on_disk_reftable = NULL; 2286 void *on_disk_refblock; 2287 int ret = 0; 2288 struct { 2289 uint64_t reftable_offset; 2290 uint32_t reftable_clusters; 2291 } QEMU_PACKED reftable_offset_and_clusters; 2292 2293 qcow2_cache_empty(bs, s->refcount_block_cache); 2294 2295 write_refblocks: 2296 for (; cluster < *nb_clusters; cluster++) { 2297 if (!s->get_refcount(*refcount_table, cluster)) { 2298 continue; 2299 } 2300 2301 refblock_index = cluster >> s->refcount_block_bits; 2302 refblock_start = refblock_index << s->refcount_block_bits; 2303 2304 /* Don't allocate a cluster in a refblock already written to disk */ 2305 if (first_free_cluster < refblock_start) { 2306 first_free_cluster = refblock_start; 2307 } 2308 refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, 2309 nb_clusters, &first_free_cluster); 2310 if (refblock_offset < 0) { 2311 fprintf(stderr, "ERROR allocating refblock: %s\n", 2312 strerror(-refblock_offset)); 2313 res->check_errors++; 2314 ret = refblock_offset; 2315 goto fail; 2316 } 2317 2318 if (reftable_size <= refblock_index) { 2319 uint32_t old_reftable_size = reftable_size; 2320 uint64_t *new_on_disk_reftable; 2321 2322 reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t), 2323 s->cluster_size) / sizeof(uint64_t); 2324 new_on_disk_reftable = g_try_realloc(on_disk_reftable, 2325 reftable_size * 2326 sizeof(uint64_t)); 2327 if (!new_on_disk_reftable) { 2328 res->check_errors++; 2329 ret = -ENOMEM; 2330 goto fail; 2331 } 2332 on_disk_reftable = new_on_disk_reftable; 2333 2334 memset(on_disk_reftable + old_reftable_size, 0, 2335 (reftable_size - old_reftable_size) * sizeof(uint64_t)); 2336 2337 /* The offset we have for the reftable is now no longer valid; 2338 * this will leak that range, but we can easily fix that by running 2339 * a leak-fixing check after this rebuild operation */ 2340 reftable_offset = -1; 2341 } else { 2342 assert(on_disk_reftable); 2343 } 2344 on_disk_reftable[refblock_index] = refblock_offset; 2345 2346 /* If this is apparently the last refblock (for now), try to squeeze the 2347 * reftable in */ 2348 if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits && 2349 reftable_offset < 0) 2350 { 2351 uint64_t reftable_clusters = size_to_clusters(s, reftable_size * 2352 sizeof(uint64_t)); 2353 reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, 2354 refcount_table, nb_clusters, 2355 &first_free_cluster); 2356 if (reftable_offset < 0) { 2357 fprintf(stderr, "ERROR allocating reftable: %s\n", 2358 strerror(-reftable_offset)); 2359 res->check_errors++; 2360 ret = reftable_offset; 2361 goto fail; 2362 } 2363 } 2364 2365 ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, 2366 s->cluster_size); 2367 if (ret < 0) { 2368 fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); 2369 goto fail; 2370 } 2371 2372 /* The size of *refcount_table is always cluster-aligned, therefore the 2373 * write operation will not overflow */ 2374 on_disk_refblock = (void *)((char *) *refcount_table + 2375 refblock_index * s->cluster_size); 2376 2377 ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE, 2378 on_disk_refblock, s->cluster_sectors); 
2379 if (ret < 0) { 2380 fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); 2381 goto fail; 2382 } 2383 2384 /* Go to the end of this refblock */ 2385 cluster = refblock_start + s->refcount_block_size - 1; 2386 } 2387 2388 if (reftable_offset < 0) { 2389 uint64_t post_refblock_start, reftable_clusters; 2390 2391 post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size); 2392 reftable_clusters = size_to_clusters(s, 2393 reftable_size * sizeof(uint64_t)); 2394 /* Not pretty but simple */ 2395 if (first_free_cluster < post_refblock_start) { 2396 first_free_cluster = post_refblock_start; 2397 } 2398 reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, 2399 refcount_table, nb_clusters, 2400 &first_free_cluster); 2401 if (reftable_offset < 0) { 2402 fprintf(stderr, "ERROR allocating reftable: %s\n", 2403 strerror(-reftable_offset)); 2404 res->check_errors++; 2405 ret = reftable_offset; 2406 goto fail; 2407 } 2408 2409 goto write_refblocks; 2410 } 2411 2412 for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { 2413 cpu_to_be64s(&on_disk_reftable[refblock_index]); 2414 } 2415 2416 ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, 2417 reftable_size * sizeof(uint64_t)); 2418 if (ret < 0) { 2419 fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); 2420 goto fail; 2421 } 2422 2423 assert(reftable_size < INT_MAX / sizeof(uint64_t)); 2424 ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, 2425 reftable_size * sizeof(uint64_t)); 2426 if (ret < 0) { 2427 fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); 2428 goto fail; 2429 } 2430 2431 /* Enter new reftable into the image header */ 2432 reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); 2433 reftable_offset_and_clusters.reftable_clusters = 2434 cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t))); 2435 ret = bdrv_pwrite_sync(bs->file, 2436 offsetof(QCowHeader, refcount_table_offset), 2437 &reftable_offset_and_clusters, 2438 sizeof(reftable_offset_and_clusters)); 2439 if (ret < 0) { 2440 fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret)); 2441 goto fail; 2442 } 2443 2444 for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { 2445 be64_to_cpus(&on_disk_reftable[refblock_index]); 2446 } 2447 s->refcount_table = on_disk_reftable; 2448 s->refcount_table_offset = reftable_offset; 2449 s->refcount_table_size = reftable_size; 2450 update_max_refcount_table_index(s); 2451 2452 return 0; 2453 2454 fail: 2455 g_free(on_disk_reftable); 2456 return ret; 2457 } 2458 2459 /* 2460 * Checks an image for refcount consistency. 2461 * 2462 * Returns 0 if no errors are found, the number of errors in case the image is 2463 * detected as corrupted, and -errno when an internal error occurred. 
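 *
 * The check proceeds in stages: calculate_refcounts() builds the in-memory
 * refcount table, compare_refcounts() diffs it against the on-disk refcounts,
 * rebuild_refcount_structure() replaces the on-disk structures entirely if
 * they are too broken to be fixed in place (and BDRV_FIX_ERRORS is set), and
 * check_oflag_copied() finally validates the COPIED flags against the
 * (possibly repaired) refcounts.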
2464 */ 2465 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 2466 BdrvCheckMode fix) 2467 { 2468 BDRVQcow2State *s = bs->opaque; 2469 BdrvCheckResult pre_compare_res; 2470 int64_t size, highest_cluster, nb_clusters; 2471 void *refcount_table = NULL; 2472 bool rebuild = false; 2473 int ret; 2474 2475 size = bdrv_getlength(bs->file->bs); 2476 if (size < 0) { 2477 res->check_errors++; 2478 return size; 2479 } 2480 2481 nb_clusters = size_to_clusters(s, size); 2482 if (nb_clusters > INT_MAX) { 2483 res->check_errors++; 2484 return -EFBIG; 2485 } 2486 2487 res->bfi.total_clusters = 2488 size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); 2489 2490 ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table, 2491 &nb_clusters); 2492 if (ret < 0) { 2493 goto fail; 2494 } 2495 2496 /* In case we don't need to rebuild the refcount structure (but want to fix 2497 * something), this function is immediately called again, in which case the 2498 * result should be ignored */ 2499 pre_compare_res = *res; 2500 compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table, 2501 nb_clusters); 2502 2503 if (rebuild && (fix & BDRV_FIX_ERRORS)) { 2504 BdrvCheckResult old_res = *res; 2505 int fresh_leaks = 0; 2506 2507 fprintf(stderr, "Rebuilding refcount structure\n"); 2508 ret = rebuild_refcount_structure(bs, res, &refcount_table, 2509 &nb_clusters); 2510 if (ret < 0) { 2511 goto fail; 2512 } 2513 2514 res->corruptions = 0; 2515 res->leaks = 0; 2516 2517 /* Because the old reftable has been exchanged for a new one the 2518 * references have to be recalculated */ 2519 rebuild = false; 2520 memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters)); 2521 ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table, 2522 &nb_clusters); 2523 if (ret < 0) { 2524 goto fail; 2525 } 2526 2527 if (fix & BDRV_FIX_LEAKS) { 2528 /* The old refcount structures are now leaked, fix it; the result 2529 * can be ignored, aside from leaks which were introduced by 2530 * rebuild_refcount_structure() that could not be fixed */ 2531 BdrvCheckResult saved_res = *res; 2532 *res = (BdrvCheckResult){ 0 }; 2533 2534 compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild, 2535 &highest_cluster, refcount_table, nb_clusters); 2536 if (rebuild) { 2537 fprintf(stderr, "ERROR rebuilt refcount structure is still " 2538 "broken\n"); 2539 } 2540 2541 /* Any leaks accounted for here were introduced by 2542 * rebuild_refcount_structure() because that function has created a 2543 * new refcount structure from scratch */ 2544 fresh_leaks = res->leaks; 2545 *res = saved_res; 2546 } 2547 2548 if (res->corruptions < old_res.corruptions) { 2549 res->corruptions_fixed += old_res.corruptions - res->corruptions; 2550 } 2551 if (res->leaks < old_res.leaks) { 2552 res->leaks_fixed += old_res.leaks - res->leaks; 2553 } 2554 res->leaks += fresh_leaks; 2555 } else if (fix) { 2556 if (rebuild) { 2557 fprintf(stderr, "ERROR need to rebuild refcount structures\n"); 2558 res->check_errors++; 2559 ret = -EIO; 2560 goto fail; 2561 } 2562 2563 if (res->leaks || res->corruptions) { 2564 *res = pre_compare_res; 2565 compare_refcounts(bs, res, fix, &rebuild, &highest_cluster, 2566 refcount_table, nb_clusters); 2567 } 2568 } 2569 2570 /* check OFLAG_COPIED */ 2571 ret = check_oflag_copied(bs, res, fix); 2572 if (ret < 0) { 2573 goto fail; 2574 } 2575 2576 res->image_end_offset = (highest_cluster + 1) * s->cluster_size; 2577 ret = 0; 2578 2579 fail: 2580 g_free(refcount_table); 2581 2582 return ret; 2583 } 2584 2585 
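/* Helper for qcow2_check_metadata_overlap() below: the macro implicitly
 * captures that function's own @offset and @size parameters, so it must only
 * be used inside that function. */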
#define overlaps_with(ofs, sz) \ 2586 ranges_overlap(offset, size, ofs, sz) 2587 2588 /* 2589 * Checks if the given offset into the image file is actually free to use by 2590 * looking for overlaps with important metadata sections (L1/L2 tables etc.), 2591 * i.e. a sanity check without relying on the refcount tables. 2592 * 2593 * The ign parameter specifies what checks not to perform (being a bitmask of 2594 * QCow2MetadataOverlap values), i.e., what sections to ignore. 2595 * 2596 * Returns: 2597 * - 0 if writing to this offset will not affect the mentioned metadata 2598 * - a positive QCow2MetadataOverlap value indicating one overlapping section 2599 * - a negative value (-errno) indicating an error while performing a check, 2600 * e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2 2601 */ 2602 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, 2603 int64_t size) 2604 { 2605 BDRVQcow2State *s = bs->opaque; 2606 int chk = s->overlap_check & ~ign; 2607 int i, j; 2608 2609 if (!size) { 2610 return 0; 2611 } 2612 2613 if (chk & QCOW2_OL_MAIN_HEADER) { 2614 if (offset < s->cluster_size) { 2615 return QCOW2_OL_MAIN_HEADER; 2616 } 2617 } 2618 2619 /* align range to test to cluster boundaries */ 2620 size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size); 2621 offset = start_of_cluster(s, offset); 2622 2623 if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { 2624 if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { 2625 return QCOW2_OL_ACTIVE_L1; 2626 } 2627 } 2628 2629 if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { 2630 if (overlaps_with(s->refcount_table_offset, 2631 s->refcount_table_size * sizeof(uint64_t))) { 2632 return QCOW2_OL_REFCOUNT_TABLE; 2633 } 2634 } 2635 2636 if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { 2637 if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { 2638 return QCOW2_OL_SNAPSHOT_TABLE; 2639 } 2640 } 2641 2642 if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { 2643 for (i = 0; i < s->nb_snapshots; i++) { 2644 if (s->snapshots[i].l1_size && 2645 overlaps_with(s->snapshots[i].l1_table_offset, 2646 s->snapshots[i].l1_size * sizeof(uint64_t))) { 2647 return QCOW2_OL_INACTIVE_L1; 2648 } 2649 } 2650 } 2651 2652 if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { 2653 for (i = 0; i < s->l1_size; i++) { 2654 if ((s->l1_table[i] & L1E_OFFSET_MASK) && 2655 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, 2656 s->cluster_size)) { 2657 return QCOW2_OL_ACTIVE_L2; 2658 } 2659 } 2660 } 2661 2662 if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { 2663 unsigned last_entry = s->max_refcount_table_index; 2664 assert(last_entry < s->refcount_table_size); 2665 assert(last_entry + 1 == s->refcount_table_size || 2666 (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0); 2667 for (i = 0; i <= last_entry; i++) { 2668 if ((s->refcount_table[i] & REFT_OFFSET_MASK) && 2669 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, 2670 s->cluster_size)) { 2671 return QCOW2_OL_REFCOUNT_BLOCK; 2672 } 2673 } 2674 } 2675 2676 if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { 2677 for (i = 0; i < s->nb_snapshots; i++) { 2678 uint64_t l1_ofs = s->snapshots[i].l1_table_offset; 2679 uint32_t l1_sz = s->snapshots[i].l1_size; 2680 uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); 2681 uint64_t *l1; 2682 int ret; 2683 2684 ret = qcow2_validate_table(bs, l1_ofs, l1_sz, sizeof(uint64_t), 2685 QCOW_MAX_L1_SIZE, "", NULL); 2686 if (ret < 0) { 2687 return ret; 2688 } 2689 2690 l1 = g_try_malloc(l1_sz2); 
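            /* g_try_malloc() returns NULL for a zero-sized request, so an
             * empty snapshot L1 table must not be mistaken for an allocation
             * failure; hence the l1_sz2 check below */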
2691 2692 if (l1_sz2 && l1 == NULL) { 2693 return -ENOMEM; 2694 } 2695 2696 ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); 2697 if (ret < 0) { 2698 g_free(l1); 2699 return ret; 2700 } 2701 2702 for (j = 0; j < l1_sz; j++) { 2703 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; 2704 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { 2705 g_free(l1); 2706 return QCOW2_OL_INACTIVE_L2; 2707 } 2708 } 2709 2710 g_free(l1); 2711 } 2712 } 2713 2714 if ((chk & QCOW2_OL_BITMAP_DIRECTORY) && 2715 (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) 2716 { 2717 if (overlaps_with(s->bitmap_directory_offset, 2718 s->bitmap_directory_size)) 2719 { 2720 return QCOW2_OL_BITMAP_DIRECTORY; 2721 } 2722 } 2723 2724 return 0; 2725 } 2726 2727 static const char *metadata_ol_names[] = { 2728 [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", 2729 [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", 2730 [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", 2731 [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", 2732 [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", 2733 [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", 2734 [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", 2735 [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", 2736 [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = "bitmap directory", 2737 }; 2738 QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names)); 2739 2740 /* 2741 * First performs a check for metadata overlaps (through 2742 * qcow2_check_metadata_overlap); if that fails with a negative value (error 2743 * while performing a check), that value is returned. If an impending overlap 2744 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt 2745 * and -EIO returned. 2746 * 2747 * Returns 0 if there were neither overlaps nor errors while checking for 2748 * overlaps; or a negative value (-errno) on error. 2749 */ 2750 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, 2751 int64_t size) 2752 { 2753 int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); 2754 2755 if (ret < 0) { 2756 return ret; 2757 } else if (ret > 0) { 2758 int metadata_ol_bitnr = ctz32(ret); 2759 assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); 2760 2761 qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " 2762 "write on metadata (overlaps with %s)", 2763 metadata_ol_names[metadata_ol_bitnr]); 2764 return -EIO; 2765 } 2766 2767 return 0; 2768 } 2769 2770 /* A pointer to a function of this type is given to walk_over_reftable(). That 2771 * function will create refblocks and pass them to a RefblockFinishOp once they 2772 * are completed (@refblock). @refblock_empty is set if the refblock is 2773 * completely empty. 2774 * 2775 * Along with the refblock, a corresponding reftable entry is passed, in the 2776 * reftable @reftable (which may be reallocated) at @reftable_index. 2777 * 2778 * @allocated should be set to true if a new cluster has been allocated. 2779 */ 2780 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable, 2781 uint64_t reftable_index, uint64_t *reftable_size, 2782 void *refblock, bool refblock_empty, 2783 bool *allocated, Error **errp); 2784 2785 /** 2786 * This "operation" for walk_over_reftable() allocates the refblock on disk (if 2787 * it is not empty) and inserts its offset into the new reftable. The size of 2788 * this new reftable is increased as required. 
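 *
 * (The reftable buffer grows in whole clusters, i.e. in multiples of
 * cluster_size / sizeof(uint64_t) entries, and its byte size is capped at
 * QCOW_MAX_REFTABLE_SIZE.)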
2789 */ 2790 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable, 2791 uint64_t reftable_index, uint64_t *reftable_size, 2792 void *refblock, bool refblock_empty, bool *allocated, 2793 Error **errp) 2794 { 2795 BDRVQcow2State *s = bs->opaque; 2796 int64_t offset; 2797 2798 if (!refblock_empty && reftable_index >= *reftable_size) { 2799 uint64_t *new_reftable; 2800 uint64_t new_reftable_size; 2801 2802 new_reftable_size = ROUND_UP(reftable_index + 1, 2803 s->cluster_size / sizeof(uint64_t)); 2804 if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { 2805 error_setg(errp, 2806 "This operation would make the refcount table grow " 2807 "beyond the maximum size supported by QEMU, aborting"); 2808 return -ENOTSUP; 2809 } 2810 2811 new_reftable = g_try_realloc(*reftable, new_reftable_size * 2812 sizeof(uint64_t)); 2813 if (!new_reftable) { 2814 error_setg(errp, "Failed to increase reftable buffer size"); 2815 return -ENOMEM; 2816 } 2817 2818 memset(new_reftable + *reftable_size, 0, 2819 (new_reftable_size - *reftable_size) * sizeof(uint64_t)); 2820 2821 *reftable = new_reftable; 2822 *reftable_size = new_reftable_size; 2823 } 2824 2825 if (!refblock_empty && !(*reftable)[reftable_index]) { 2826 offset = qcow2_alloc_clusters(bs, s->cluster_size); 2827 if (offset < 0) { 2828 error_setg_errno(errp, -offset, "Failed to allocate refblock"); 2829 return offset; 2830 } 2831 (*reftable)[reftable_index] = offset; 2832 *allocated = true; 2833 } 2834 2835 return 0; 2836 } 2837 2838 /** 2839 * This "operation" for walk_over_reftable() writes the refblock to disk at the 2840 * offset specified by the new reftable's entry. It does not modify the new 2841 * reftable or change any refcounts. 2842 */ 2843 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable, 2844 uint64_t reftable_index, uint64_t *reftable_size, 2845 void *refblock, bool refblock_empty, bool *allocated, 2846 Error **errp) 2847 { 2848 BDRVQcow2State *s = bs->opaque; 2849 int64_t offset; 2850 int ret; 2851 2852 if (reftable_index < *reftable_size && (*reftable)[reftable_index]) { 2853 offset = (*reftable)[reftable_index]; 2854 2855 ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); 2856 if (ret < 0) { 2857 error_setg_errno(errp, -ret, "Overlap check failed"); 2858 return ret; 2859 } 2860 2861 ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size); 2862 if (ret < 0) { 2863 error_setg_errno(errp, -ret, "Failed to write refblock"); 2864 return ret; 2865 } 2866 } else { 2867 assert(refblock_empty); 2868 } 2869 2870 return 0; 2871 } 2872 2873 /** 2874 * This function walks over the existing reftable and every referenced refblock; 2875 * if @new_set_refcount is non-NULL, it is called for every refcount entry to 2876 * create an equal new entry in the passed @new_refblock. Once that 2877 * @new_refblock is completely filled, @operation will be called. 2878 * 2879 * @status_cb and @cb_opaque are used for the amend operation's status callback. 2880 * @index is the index of the walk_over_reftable() calls and @total is the total 2881 * number of walk_over_reftable() calls per amend operation. Both are used for 2882 * calculating the parameters for the status callback. 2883 * 2884 * @allocated is set to true if a new cluster has been allocated. 
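 *
 * (@new_refblock_size is the number of entries per refblock at the new
 * refcount width, i.e. cluster_size * 8 / new_refcount_bits.)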
2885 */ 2886 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable, 2887 uint64_t *new_reftable_index, 2888 uint64_t *new_reftable_size, 2889 void *new_refblock, int new_refblock_size, 2890 int new_refcount_bits, 2891 RefblockFinishOp *operation, bool *allocated, 2892 Qcow2SetRefcountFunc *new_set_refcount, 2893 BlockDriverAmendStatusCB *status_cb, 2894 void *cb_opaque, int index, int total, 2895 Error **errp) 2896 { 2897 BDRVQcow2State *s = bs->opaque; 2898 uint64_t reftable_index; 2899 bool new_refblock_empty = true; 2900 int refblock_index; 2901 int new_refblock_index = 0; 2902 int ret; 2903 2904 for (reftable_index = 0; reftable_index < s->refcount_table_size; 2905 reftable_index++) 2906 { 2907 uint64_t refblock_offset = s->refcount_table[reftable_index] 2908 & REFT_OFFSET_MASK; 2909 2910 status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index, 2911 (uint64_t)total * s->refcount_table_size, cb_opaque); 2912 2913 if (refblock_offset) { 2914 void *refblock; 2915 2916 if (offset_into_cluster(s, refblock_offset)) { 2917 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" 2918 PRIx64 " unaligned (reftable index: %#" 2919 PRIx64 ")", refblock_offset, 2920 reftable_index); 2921 error_setg(errp, 2922 "Image is corrupt (unaligned refblock offset)"); 2923 return -EIO; 2924 } 2925 2926 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset, 2927 &refblock); 2928 if (ret < 0) { 2929 error_setg_errno(errp, -ret, "Failed to retrieve refblock"); 2930 return ret; 2931 } 2932 2933 for (refblock_index = 0; refblock_index < s->refcount_block_size; 2934 refblock_index++) 2935 { 2936 uint64_t refcount; 2937 2938 if (new_refblock_index >= new_refblock_size) { 2939 /* new_refblock is now complete */ 2940 ret = operation(bs, new_reftable, *new_reftable_index, 2941 new_reftable_size, new_refblock, 2942 new_refblock_empty, allocated, errp); 2943 if (ret < 0) { 2944 qcow2_cache_put(s->refcount_block_cache, &refblock); 2945 return ret; 2946 } 2947 2948 (*new_reftable_index)++; 2949 new_refblock_index = 0; 2950 new_refblock_empty = true; 2951 } 2952 2953 refcount = s->get_refcount(refblock, refblock_index); 2954 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) { 2955 uint64_t offset; 2956 2957 qcow2_cache_put(s->refcount_block_cache, &refblock); 2958 2959 offset = ((reftable_index << s->refcount_block_bits) 2960 + refblock_index) << s->cluster_bits; 2961 2962 error_setg(errp, "Cannot decrease refcount entry width to " 2963 "%i bits: Cluster at offset %#" PRIx64 " has a " 2964 "refcount of %" PRIu64, new_refcount_bits, 2965 offset, refcount); 2966 return -EINVAL; 2967 } 2968 2969 if (new_set_refcount) { 2970 new_set_refcount(new_refblock, new_refblock_index++, 2971 refcount); 2972 } else { 2973 new_refblock_index++; 2974 } 2975 new_refblock_empty = new_refblock_empty && refcount == 0; 2976 } 2977 2978 qcow2_cache_put(s->refcount_block_cache, &refblock); 2979 } else { 2980 /* No refblock means every refcount is 0 */ 2981 for (refblock_index = 0; refblock_index < s->refcount_block_size; 2982 refblock_index++) 2983 { 2984 if (new_refblock_index >= new_refblock_size) { 2985 /* new_refblock is now complete */ 2986 ret = operation(bs, new_reftable, *new_reftable_index, 2987 new_reftable_size, new_refblock, 2988 new_refblock_empty, allocated, errp); 2989 if (ret < 0) { 2990 return ret; 2991 } 2992 2993 (*new_reftable_index)++; 2994 new_refblock_index = 0; 2995 new_refblock_empty = true; 2996 } 2997 2998 if (new_set_refcount) { 2999 
new_set_refcount(new_refblock, new_refblock_index++, 0); 3000 } else { 3001 new_refblock_index++; 3002 } 3003 } 3004 } 3005 } 3006 3007 if (new_refblock_index > 0) { 3008 /* Complete the potentially existing partially filled final refblock */ 3009 if (new_set_refcount) { 3010 for (; new_refblock_index < new_refblock_size; 3011 new_refblock_index++) 3012 { 3013 new_set_refcount(new_refblock, new_refblock_index, 0); 3014 } 3015 } 3016 3017 ret = operation(bs, new_reftable, *new_reftable_index, 3018 new_reftable_size, new_refblock, new_refblock_empty, 3019 allocated, errp); 3020 if (ret < 0) { 3021 return ret; 3022 } 3023 3024 (*new_reftable_index)++; 3025 } 3026 3027 status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size, 3028 (uint64_t)total * s->refcount_table_size, cb_opaque); 3029 3030 return 0; 3031 } 3032 3033 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, 3034 BlockDriverAmendStatusCB *status_cb, 3035 void *cb_opaque, Error **errp) 3036 { 3037 BDRVQcow2State *s = bs->opaque; 3038 Qcow2GetRefcountFunc *new_get_refcount; 3039 Qcow2SetRefcountFunc *new_set_refcount; 3040 void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size); 3041 uint64_t *new_reftable = NULL, new_reftable_size = 0; 3042 uint64_t *old_reftable, old_reftable_size, old_reftable_offset; 3043 uint64_t new_reftable_index = 0; 3044 uint64_t i; 3045 int64_t new_reftable_offset = 0, allocated_reftable_size = 0; 3046 int new_refblock_size, new_refcount_bits = 1 << refcount_order; 3047 int old_refcount_order; 3048 int walk_index = 0; 3049 int ret; 3050 bool new_allocation; 3051 3052 assert(s->qcow_version >= 3); 3053 assert(refcount_order >= 0 && refcount_order <= 6); 3054 3055 /* see qcow2_open() */ 3056 new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3)); 3057 3058 new_get_refcount = get_refcount_funcs[refcount_order]; 3059 new_set_refcount = set_refcount_funcs[refcount_order]; 3060 3061 3062 do { 3063 int total_walks; 3064 3065 new_allocation = false; 3066 3067 /* We need at least this walk plus the one that writes the 3068 * refblocks; in addition, this loop normally has to run at least 3069 * twice: once to perform the allocations, and a second time to 3070 * verify that everything has been allocated correctly. That 3071 * makes three walks in total. */ 3072 total_walks = MAX(walk_index + 2, 3); 3073 3074 /* First, allocate the new structures so they are reflected in the 3075 * refcount structures */ 3076 ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, 3077 &new_reftable_size, NULL, new_refblock_size, 3078 new_refcount_bits, &alloc_refblock, 3079 &new_allocation, NULL, status_cb, cb_opaque, 3080 walk_index++, total_walks, errp); 3081 if (ret < 0) { 3082 goto done; 3083 } 3084 3085 new_reftable_index = 0; 3086 3087 if (new_allocation) { 3088 if (new_reftable_offset) { 3089 qcow2_free_clusters(bs, new_reftable_offset, 3090 allocated_reftable_size * sizeof(uint64_t), 3091 QCOW2_DISCARD_NEVER); 3092 } 3093 3094 new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size * 3095 sizeof(uint64_t)); 3096 if (new_reftable_offset < 0) { 3097 error_setg_errno(errp, -new_reftable_offset, 3098 "Failed to allocate the new reftable"); 3099 ret = new_reftable_offset; 3100 goto done; 3101 } 3102 allocated_reftable_size = new_reftable_size; 3103 } 3104 } while (new_allocation); 3105 3106 /* Second, write the new refblocks */ 3107 ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, 3108 &new_reftable_size, new_refblock, 3109 new_refblock_size,
new_refcount_bits, 3110 &flush_refblock, &new_allocation, new_set_refcount, 3111 status_cb, cb_opaque, walk_index, walk_index + 1, 3112 errp); 3113 if (ret < 0) { 3114 goto done; 3115 } 3116 assert(!new_allocation); 3117 3118 3119 /* Write the new reftable */ 3120 ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset, 3121 new_reftable_size * sizeof(uint64_t)); 3122 if (ret < 0) { 3123 error_setg_errno(errp, -ret, "Overlap check failed"); 3124 goto done; 3125 } 3126 3127 for (i = 0; i < new_reftable_size; i++) { 3128 cpu_to_be64s(&new_reftable[i]); 3129 } 3130 3131 ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable, 3132 new_reftable_size * sizeof(uint64_t)); 3133 3134 for (i = 0; i < new_reftable_size; i++) { 3135 be64_to_cpus(&new_reftable[i]); 3136 } 3137 3138 if (ret < 0) { 3139 error_setg_errno(errp, -ret, "Failed to write the new reftable"); 3140 goto done; 3141 } 3142 3143 3144 /* Empty the refcount cache */ 3145 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 3146 if (ret < 0) { 3147 error_setg_errno(errp, -ret, "Failed to flush the refblock cache"); 3148 goto done; 3149 } 3150 3151 /* Update the image header to point to the new reftable; this only updates 3152 * the fields which are relevant to qcow2_update_header(); other fields 3153 * such as s->refcount_table or s->refcount_bits stay stale for now 3154 * (because we have to restore everything if qcow2_update_header() fails) */ 3155 old_refcount_order = s->refcount_order; 3156 old_reftable_size = s->refcount_table_size; 3157 old_reftable_offset = s->refcount_table_offset; 3158 3159 s->refcount_order = refcount_order; 3160 s->refcount_table_size = new_reftable_size; 3161 s->refcount_table_offset = new_reftable_offset; 3162 3163 ret = qcow2_update_header(bs); 3164 if (ret < 0) { 3165 s->refcount_order = old_refcount_order; 3166 s->refcount_table_size = old_reftable_size; 3167 s->refcount_table_offset = old_reftable_offset; 3168 error_setg_errno(errp, -ret, "Failed to update the qcow2 header"); 3169 goto done; 3170 } 3171 3172 /* Now update the rest of the in-memory information */ 3173 old_reftable = s->refcount_table; 3174 s->refcount_table = new_reftable; 3175 update_max_refcount_table_index(s); 3176 3177 s->refcount_bits = 1 << refcount_order; 3178 s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); 3179 s->refcount_max += s->refcount_max - 1; 3180 3181 s->refcount_block_bits = s->cluster_bits - (refcount_order - 3); 3182 s->refcount_block_size = 1 << s->refcount_block_bits; 3183 3184 s->get_refcount = new_get_refcount; 3185 s->set_refcount = new_set_refcount; 3186 3187 /* For cleaning up all old refblocks and the old reftable below the "done" 3188 * label */ 3189 new_reftable = old_reftable; 3190 new_reftable_size = old_reftable_size; 3191 new_reftable_offset = old_reftable_offset; 3192 3193 done: 3194 if (new_reftable) { 3195 /* On success, new_reftable actually points to the old reftable (and 3196 * new_reftable_size is the old reftable's size); but that is just 3197 * fine */ 3198 for (i = 0; i < new_reftable_size; i++) { 3199 uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK; 3200 if (offset) { 3201 qcow2_free_clusters(bs, offset, s->cluster_size, 3202 QCOW2_DISCARD_OTHER); 3203 } 3204 } 3205 g_free(new_reftable); 3206 3207 if (new_reftable_offset > 0) { 3208 qcow2_free_clusters(bs, new_reftable_offset, 3209 new_reftable_size * sizeof(uint64_t), 3210 QCOW2_DISCARD_OTHER); 3211 } 3212 } 3213 3214 qemu_vfree(new_refblock); 3215 return ret; 3216 } 3217 3218 static int64_t 
get_refblock_offset(BlockDriverState *bs, uint64_t offset) 3219 { 3220 BDRVQcow2State *s = bs->opaque; 3221 uint32_t index = offset_to_reftable_index(s, offset); 3222 int64_t covering_refblock_offset = 0; 3223 3224 if (index < s->refcount_table_size) { 3225 covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK; 3226 } 3227 if (!covering_refblock_offset) { 3228 qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is " 3229 "not covered by the refcount structures", 3230 offset); 3231 return -EIO; 3232 } 3233 3234 return covering_refblock_offset; 3235 } 3236 3237 static int qcow2_discard_refcount_block(BlockDriverState *bs, 3238 uint64_t discard_block_offs) 3239 { 3240 BDRVQcow2State *s = bs->opaque; 3241 int64_t refblock_offs; 3242 uint64_t cluster_index = discard_block_offs >> s->cluster_bits; 3243 uint32_t block_index = cluster_index & (s->refcount_block_size - 1); 3244 void *refblock; 3245 int ret; 3246 3247 refblock_offs = get_refblock_offset(bs, discard_block_offs); 3248 if (refblock_offs < 0) { 3249 return refblock_offs; 3250 } 3251 3252 assert(discard_block_offs != 0); 3253 3254 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs, 3255 &refblock); 3256 if (ret < 0) { 3257 return ret; 3258 } 3259 3260 if (s->get_refcount(refblock, block_index) != 1) { 3261 qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:" 3262 " refblock offset %#" PRIx64 3263 ", reftable index %u" 3264 ", block offset %#" PRIx64 3265 ", refcount %#" PRIx64, 3266 refblock_offs, 3267 offset_to_reftable_index(s, discard_block_offs), 3268 discard_block_offs, 3269 s->get_refcount(refblock, block_index)); 3270 qcow2_cache_put(s->refcount_block_cache, &refblock); 3271 return -EINVAL; 3272 } 3273 s->set_refcount(refblock, block_index, 0); 3274 3275 qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock); 3276 3277 qcow2_cache_put(s->refcount_block_cache, &refblock); 3278 3279 if (cluster_index < s->free_cluster_index) { 3280 s->free_cluster_index = cluster_index; 3281 } 3282 3283 refblock = qcow2_cache_is_table_offset(s->refcount_block_cache, 3284 discard_block_offs); 3285 if (refblock) { 3286 /* discard the refblock from the cache if it is cached */ 3287 qcow2_cache_discard(s->refcount_block_cache, refblock); 3288 } 3289 update_refcount_discard(bs, discard_block_offs, s->cluster_size); 3290 3291 return 0; 3292 } 3293 3294 int qcow2_shrink_reftable(BlockDriverState *bs) 3295 { 3296 BDRVQcow2State *s = bs->opaque; 3297 uint64_t *reftable_tmp = 3298 g_malloc(s->refcount_table_size * sizeof(uint64_t)); 3299 int i, ret; 3300 3301 for (i = 0; i < s->refcount_table_size; i++) { 3302 int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK; 3303 void *refblock; 3304 bool unused_block; 3305 3306 if (refblock_offs == 0) { 3307 reftable_tmp[i] = 0; 3308 continue; 3309 } 3310 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs, 3311 &refblock); 3312 if (ret < 0) { 3313 goto out; 3314 } 3315 3316 /* the refblock holds its own reference, which must be masked out before testing whether the block is otherwise unused */ 3317 if (i == offset_to_reftable_index(s, refblock_offs)) { 3318 uint64_t block_index = (refblock_offs >> s->cluster_bits) & 3319 (s->refcount_block_size - 1); 3320 uint64_t refcount = s->get_refcount(refblock, block_index); 3321 3322 s->set_refcount(refblock, block_index, 0); 3323 3324 unused_block = buffer_is_zero(refblock, s->cluster_size); 3325 3326 s->set_refcount(refblock, block_index, refcount); 3327 } else { 3328 unused_block = buffer_is_zero(refblock, s->cluster_size); 3329 } 3330
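        /* unused_block now tells whether this refblock references nothing
         * besides (possibly) itself, i.e. whether its reftable entry can be
         * dropped */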
qcow2_cache_put(s->refcount_block_cache, &refblock); 3331 3332 reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]); 3333 } 3334 3335 ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp, 3336 s->refcount_table_size * sizeof(uint64_t)); 3337 /* 3338 * If writing the reftable failed, the image may contain a partially 3339 * overwritten reftable. In this case it is better to clear the affected 3340 * entries of the in-memory reftable as well, to avoid possible image 3341 * corruption. 3342 */ 3342 for (i = 0; i < s->refcount_table_size; i++) { 3343 if (s->refcount_table[i] && !reftable_tmp[i]) { 3344 if (ret == 0) { 3345 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] & 3346 REFT_OFFSET_MASK); 3347 } 3348 s->refcount_table[i] = 0; 3349 } 3350 } 3351 3352 if (!s->cache_discards) { 3353 qcow2_process_discards(bs, ret); 3354 } 3355 3356 out: 3357 g_free(reftable_tmp); 3358 return ret; 3359 } 3360 3361 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size) 3362 { 3363 BDRVQcow2State *s = bs->opaque; 3364 int64_t i; 3365 3366 for (i = size_to_clusters(s, size) - 1; i >= 0; i--) { 3367 uint64_t refcount; 3368 int ret = qcow2_get_refcount(bs, i, &refcount); 3369 if (ret < 0) { 3370 fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", 3371 i, strerror(-ret)); 3372 return ret; 3373 } 3374 if (refcount > 0) { 3375 return i; 3376 } 3377 } 3378 qcow2_signal_corruption(bs, true, -1, -1, 3379 "There are no references in the refcount table."); 3380 return -EIO; 3381 } 3382
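/*
 * For illustration: the refcount width chosen via qcow2_change_refcount_order()
 * determines how many entries fit into one refblock, namely
 * cluster_size * 8 / refcount_bits. With the default 64 KiB clusters this
 * gives, for example:
 *
 *   refcount_order 0 (1-bit refcounts):  524288 entries per refblock
 *   refcount_order 4 (16-bit refcounts):  32768 entries per refblock (default)
 *   refcount_order 6 (64-bit refcounts):   8192 entries per refblock
 *
 * matching new_refblock_size = 1 << (cluster_bits - (refcount_order - 3))
 * above.
 */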