/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "block/block-io.h"
#include "qapi/error.h"
#include "qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"
#include "qemu/cutils.h"
#include "qemu/memalign.h"
#include "trace.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                    uint64_t max);

G_GNUC_WARN_UNUSED_RESULT
static int update_refcount(BlockDriverState *bs,
                           int64_t offset, int64_t length, uint64_t addend,
                           bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};


/*********************************************************/
/* refcount handling */
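/*
 * Worked example of the dispatch tables above (values assumed for
 * illustration): with the default refcount_order = 4, refcounts are 16-bit
 * big-endian values, so s->get_refcount/set_refcount point at
 * get/set_refcount_ro4. A 64 KiB cluster then holds 65536 / 2 = 32768
 * refcount entries, i.e. one refcount block covers 32768 * 64 KiB = 2 GiB
 * of the image file.
 */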
static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
    refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_CO_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
                            refcount_table_size2, s->refcount_table, 0);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}
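/*
 * Worked example for the sub-byte accessors above (illustrative only): with
 * refcount_order = 1 (two bits per entry), entry 5 lives in byte 5 / 4 = 1
 * at bit offset 2 * (5 % 4) = 2. get_refcount_ro1() therefore shifts byte 1
 * right by two and masks with 0x3; set_refcount_ro1() clears exactly those
 * two bits before OR-ing in the new value, leaving the three neighbouring
 * refcounts stored in the same byte untouched.
 */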
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int GRAPH_RDLOCK
load_refcount_block(BlockDriverState *bs, int64_t refcount_block_offset,
                    void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(s->refcount_block_cache, &refcount_block);

    return 0;
}

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
                                  uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}
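/*
 * Example of the arithmetic above (numbers assumed for illustration): with
 * 64 KiB clusters (cluster_bits = 16) and 16-bit refcounts, one refcount
 * block holds 32768 entries (refcount_block_bits = 15), so two host offsets
 * are described by the same refcount block iff they lie in the same aligned
 * 2^(16+15) = 2 GiB window of the image file.
 */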
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int GRAPH_RDLOCK
alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index,
                     void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* The offset must fit in the offset field of the refcount table entry */
    assert((new_block & REFT_OFFSET_MASK) == new_block);

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
            " at %" PRIx64 "\n",
            refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif
356 * 357 * - alloc_clusters_noref and qcow2_free_clusters may load a different 358 * refcount block into the cache 359 */ 360 361 *refcount_block = NULL; 362 363 /* We write to the refcount table, so we might depend on L2 tables */ 364 ret = qcow2_cache_flush(bs, s->l2_table_cache); 365 if (ret < 0) { 366 return ret; 367 } 368 369 /* Allocate the refcount block itself and mark it as used */ 370 int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX); 371 if (new_block < 0) { 372 return new_block; 373 } 374 375 /* The offset must fit in the offset field of the refcount table entry */ 376 assert((new_block & REFT_OFFSET_MASK) == new_block); 377 378 /* If we're allocating the block at offset 0 then something is wrong */ 379 if (new_block == 0) { 380 qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid " 381 "allocation of refcount block at offset 0"); 382 return -EIO; 383 } 384 385 #ifdef DEBUG_ALLOC2 386 fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 387 " at %" PRIx64 "\n", 388 refcount_table_index, cluster_index << s->cluster_bits, new_block); 389 #endif 390 391 if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { 392 /* Zero the new refcount block before updating it */ 393 ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, 394 refcount_block); 395 if (ret < 0) { 396 goto fail; 397 } 398 399 memset(*refcount_block, 0, s->cluster_size); 400 401 /* The block describes itself, need to update the cache */ 402 int block_index = (new_block >> s->cluster_bits) & 403 (s->refcount_block_size - 1); 404 s->set_refcount(*refcount_block, block_index, 1); 405 } else { 406 /* Described somewhere else. This can recurse at most twice before we 407 * arrive at a block that describes itself. */ 408 ret = update_refcount(bs, new_block, s->cluster_size, 1, false, 409 QCOW2_DISCARD_NEVER); 410 if (ret < 0) { 411 goto fail; 412 } 413 414 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 415 if (ret < 0) { 416 goto fail; 417 } 418 419 /* Initialize the new refcount block only after updating its refcount, 420 * update_refcount uses the refcount cache itself */ 421 ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, 422 refcount_block); 423 if (ret < 0) { 424 goto fail; 425 } 426 427 memset(*refcount_block, 0, s->cluster_size); 428 } 429 430 /* Now the new refcount block needs to be written to disk */ 431 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); 432 qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block); 433 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 434 if (ret < 0) { 435 goto fail; 436 } 437 438 /* If the refcount table is big enough, just hook the block up there */ 439 if (refcount_table_index < s->refcount_table_size) { 440 uint64_t data64 = cpu_to_be64(new_block); 441 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); 442 ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset + 443 refcount_table_index * REFTABLE_ENTRY_SIZE, 444 sizeof(data64), &data64, 0); 445 if (ret < 0) { 446 goto fail; 447 } 448 449 s->refcount_table[refcount_table_index] = new_block; 450 /* If there's a hole in s->refcount_table then it can happen 451 * that refcount_table_index < s->max_refcount_table_index */ 452 s->max_refcount_table_index = 453 MAX(s->max_refcount_table_index, refcount_table_index); 454 455 /* The new refcount block may be where the caller intended to put its 456 * data, so let it restart the search. 
    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
                               refcount_table_index * REFTABLE_ENTRY_SIZE,
                               sizeof(data64), &data64, 0);
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
                           s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
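/*
 * Worked example for the reftable growth path above (numbers assumed for
 * illustration): with 64 KiB clusters and 32768 refcount entries per block,
 * a write needing cluster_index = 40000 with new_block at cluster 40001
 * gives blocks_used = DIV_ROUND_UP(40002, 32768) = 2, so meta_offset =
 * 2 * 32768 * 65536 = 4 GiB: the new self-describing refcount area is placed
 * just past everything the first two refcount blocks can cover.
 */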
531 */ 532 int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset, 533 uint64_t additional_clusters, bool exact_size, 534 int new_refblock_index, 535 uint64_t new_refblock_offset) 536 { 537 BDRVQcow2State *s = bs->opaque; 538 uint64_t total_refblock_count_u64, additional_refblock_count; 539 int total_refblock_count, table_size, area_reftable_index, table_clusters; 540 int i; 541 uint64_t table_offset, block_offset, end_offset; 542 int ret; 543 uint64_t *new_table; 544 545 assert(!(start_offset % s->cluster_size)); 546 547 qcow2_refcount_metadata_size(start_offset / s->cluster_size + 548 additional_clusters, 549 s->cluster_size, s->refcount_order, 550 !exact_size, &total_refblock_count_u64); 551 if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) { 552 return -EFBIG; 553 } 554 total_refblock_count = total_refblock_count_u64; 555 556 /* Index in the refcount table of the first refcount block to cover the area 557 * of refcount structures we are about to create; we know that 558 * @total_refblock_count can cover @start_offset, so this will definitely 559 * fit into an int. */ 560 area_reftable_index = (start_offset / s->cluster_size) / 561 s->refcount_block_size; 562 563 if (exact_size) { 564 table_size = total_refblock_count; 565 } else { 566 table_size = total_refblock_count + 567 DIV_ROUND_UP(total_refblock_count, 2); 568 } 569 /* The qcow2 file can only store the reftable size in number of clusters */ 570 table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE); 571 table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size; 572 573 if (table_size > QCOW_MAX_REFTABLE_SIZE) { 574 return -EFBIG; 575 } 576 577 new_table = g_try_new0(uint64_t, table_size); 578 579 assert(table_size > 0); 580 if (new_table == NULL) { 581 ret = -ENOMEM; 582 goto fail; 583 } 584 585 /* Fill the new refcount table */ 586 if (table_size > s->max_refcount_table_index) { 587 /* We're actually growing the reftable */ 588 memcpy(new_table, s->refcount_table, 589 (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE); 590 } else { 591 /* Improbable case: We're shrinking the reftable. However, the caller 592 * has assured us that there is only empty space beyond @start_offset, 593 * so we can simply drop all of the refblocks that won't fit into the 594 * new reftable. 
    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset,
                           table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           sizeof(data), &data, 0);
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset,
                        old_table_size * REFTABLE_ENTRY_SIZE,
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
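/*
 * Illustration of the contract above (values assumed for illustration): a
 * caller growing a 4 GiB image by 1 GiB with 64 KiB clusters passes
 * start_offset = 4 GiB and additional_clusters = 16384; the function lays
 * out the new refblocks starting at exactly start_offset, places the new
 * reftable right after them, and returns end_offset, the first byte the
 * caller may then use for those 16384 data clusters.
 */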
void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
            if (r2 < 0) {
                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
                                                           r2);
            }
        }

        g_free(d);
    }
}

static void queue_discard(BlockDriverState *bs,
                          uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs = bs,
        .offset = offset,
        .bytes = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
               || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
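/*
 * Example of the merge logic above: queueing [0x10000, +0x10000) and then
 * [0x20000, +0x10000) yields a single region [0x10000, +0x20000), because
 * new_end - new_start == length + d->bytes holds exactly when the two ranges
 * are adjacent (overlap is impossible for freed clusters, as the assertion
 * documents).
 */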
"-" : "", 823 addend); 824 #endif 825 if (length < 0) { 826 return -EINVAL; 827 } else if (length == 0) { 828 return 0; 829 } 830 831 if (decrease) { 832 qcow2_cache_set_dependency(bs, s->refcount_block_cache, 833 s->l2_table_cache); 834 } 835 836 start = start_of_cluster(s, offset); 837 last = start_of_cluster(s, offset + length - 1); 838 for(cluster_offset = start; cluster_offset <= last; 839 cluster_offset += s->cluster_size) 840 { 841 int block_index; 842 uint64_t refcount; 843 int64_t cluster_index = cluster_offset >> s->cluster_bits; 844 int64_t table_index = cluster_index >> s->refcount_block_bits; 845 846 /* Load the refcount block and allocate it if needed */ 847 if (table_index != old_table_index) { 848 if (refcount_block) { 849 qcow2_cache_put(s->refcount_block_cache, &refcount_block); 850 } 851 ret = alloc_refcount_block(bs, cluster_index, &refcount_block); 852 /* If the caller needs to restart the search for free clusters, 853 * try the same ones first to see if they're still free. */ 854 if (ret == -EAGAIN) { 855 if (s->free_cluster_index > (start >> s->cluster_bits)) { 856 s->free_cluster_index = (start >> s->cluster_bits); 857 } 858 } 859 if (ret < 0) { 860 goto fail; 861 } 862 } 863 old_table_index = table_index; 864 865 qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block); 866 867 /* we can update the count and save it */ 868 block_index = cluster_index & (s->refcount_block_size - 1); 869 870 refcount = s->get_refcount(refcount_block, block_index); 871 if (decrease ? (refcount - addend > refcount) 872 : (refcount + addend < refcount || 873 refcount + addend > s->refcount_max)) 874 { 875 ret = -EINVAL; 876 goto fail; 877 } 878 if (decrease) { 879 refcount -= addend; 880 } else { 881 refcount += addend; 882 } 883 if (refcount == 0 && cluster_index < s->free_cluster_index) { 884 s->free_cluster_index = cluster_index; 885 } 886 s->set_refcount(refcount_block, block_index, refcount); 887 888 if (refcount == 0) { 889 void *table; 890 891 table = qcow2_cache_is_table_offset(s->refcount_block_cache, 892 offset); 893 if (table != NULL) { 894 qcow2_cache_put(s->refcount_block_cache, &refcount_block); 895 old_table_index = -1; 896 qcow2_cache_discard(s->refcount_block_cache, table); 897 } 898 899 table = qcow2_cache_is_table_offset(s->l2_table_cache, offset); 900 if (table != NULL) { 901 qcow2_cache_discard(s->l2_table_cache, table); 902 } 903 904 if (s->discard_passthrough[type]) { 905 queue_discard(bs, cluster_offset, s->cluster_size); 906 } 907 } 908 } 909 910 ret = 0; 911 fail: 912 if (!s->cache_discards) { 913 qcow2_process_discards(bs, ret); 914 } 915 916 /* Write last changed block to disk */ 917 if (refcount_block) { 918 qcow2_cache_put(s->refcount_block_cache, &refcount_block); 919 } 920 921 /* 922 * Try do undo any updates if an error is returned (This may succeed in 923 * some cases like ENOSPC for allocating a new refcount block) 924 */ 925 if (ret < 0) { 926 int dummy; 927 dummy = update_refcount(bs, offset, cluster_offset - offset, addend, 928 !decrease, QCOW2_DISCARD_NEVER); 929 (void)dummy; 930 } 931 932 return ret; 933 } 934 935 /* 936 * Increases or decreases the refcount of a given cluster. 937 * 938 * @addend is the absolute value of the addend; if @decrease is set, @addend 939 * will be subtracted from the current refcount, otherwise it will be added. 940 * 941 * On success 0 is returned; on failure -errno is returned. 
942 */ 943 int qcow2_update_cluster_refcount(BlockDriverState *bs, 944 int64_t cluster_index, 945 uint64_t addend, bool decrease, 946 enum qcow2_discard_type type) 947 { 948 BDRVQcow2State *s = bs->opaque; 949 int ret; 950 951 ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, 952 decrease, type); 953 if (ret < 0) { 954 return ret; 955 } 956 957 return 0; 958 } 959 960 961 962 /*********************************************************/ 963 /* cluster allocation functions */ 964 965 966 967 /* return < 0 if error */ 968 static int64_t GRAPH_RDLOCK 969 alloc_clusters_noref(BlockDriverState *bs, uint64_t size, uint64_t max) 970 { 971 BDRVQcow2State *s = bs->opaque; 972 uint64_t i, nb_clusters, refcount; 973 int ret; 974 975 /* We can't allocate clusters if they may still be queued for discard. */ 976 if (s->cache_discards) { 977 qcow2_process_discards(bs, 0); 978 } 979 980 nb_clusters = size_to_clusters(s, size); 981 retry: 982 for(i = 0; i < nb_clusters; i++) { 983 uint64_t next_cluster_index = s->free_cluster_index++; 984 ret = qcow2_get_refcount(bs, next_cluster_index, &refcount); 985 986 if (ret < 0) { 987 return ret; 988 } else if (refcount != 0) { 989 goto retry; 990 } 991 } 992 993 /* Make sure that all offsets in the "allocated" range are representable 994 * in the requested max */ 995 if (s->free_cluster_index > 0 && 996 s->free_cluster_index - 1 > (max >> s->cluster_bits)) 997 { 998 return -EFBIG; 999 } 1000 1001 #ifdef DEBUG_ALLOC2 1002 fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", 1003 size, 1004 (s->free_cluster_index - nb_clusters) << s->cluster_bits); 1005 #endif 1006 return (s->free_cluster_index - nb_clusters) << s->cluster_bits; 1007 } 1008 1009 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) 1010 { 1011 int64_t offset; 1012 int ret; 1013 1014 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); 1015 do { 1016 offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET); 1017 if (offset < 0) { 1018 return offset; 1019 } 1020 1021 ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); 1022 } while (ret == -EAGAIN); 1023 1024 if (ret < 0) { 1025 return ret; 1026 } 1027 1028 return offset; 1029 } 1030 1031 int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, 1032 int64_t nb_clusters) 1033 { 1034 BDRVQcow2State *s = bs->opaque; 1035 uint64_t cluster_index, refcount; 1036 uint64_t i; 1037 int ret; 1038 1039 assert(nb_clusters >= 0); 1040 if (nb_clusters == 0) { 1041 return 0; 1042 } 1043 1044 do { 1045 /* Check how many clusters there are free */ 1046 cluster_index = offset >> s->cluster_bits; 1047 for(i = 0; i < nb_clusters; i++) { 1048 ret = qcow2_get_refcount(bs, cluster_index++, &refcount); 1049 if (ret < 0) { 1050 return ret; 1051 } else if (refcount != 0) { 1052 break; 1053 } 1054 } 1055 1056 /* And then allocate them */ 1057 ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false, 1058 QCOW2_DISCARD_NEVER); 1059 } while (ret == -EAGAIN); 1060 1061 if (ret < 0) { 1062 return ret; 1063 } 1064 1065 return i; 1066 } 1067 1068 /* only used to allocate compressed sectors. We try to allocate 1069 contiguous sectors. 
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t coroutine_fn GRAPH_RDLOCK qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}

void qcow2_free_clusters(BlockDriverState *bs,
                         int64_t offset, int64_t size,
                         enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}
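/*
 * Example of the byte allocator above (sizes assumed for illustration): two
 * consecutive compressed writes of 300 and 700 bytes into a fresh 64 KiB
 * cluster return offsets X and X + 300 within the same cluster, and each
 * call bumps that cluster's refcount by one; this is why the code checks the
 * refcount against s->refcount_max before reusing s->free_byte_offset.
 */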
1161 */ 1162 void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry, 1163 enum qcow2_discard_type type) 1164 { 1165 BDRVQcow2State *s = bs->opaque; 1166 QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry); 1167 1168 if (has_data_file(bs)) { 1169 if (s->discard_passthrough[type] && 1170 (ctype == QCOW2_CLUSTER_NORMAL || 1171 ctype == QCOW2_CLUSTER_ZERO_ALLOC)) 1172 { 1173 bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK, 1174 s->cluster_size); 1175 } 1176 return; 1177 } 1178 1179 switch (ctype) { 1180 case QCOW2_CLUSTER_COMPRESSED: 1181 { 1182 uint64_t coffset; 1183 int csize; 1184 1185 qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize); 1186 qcow2_free_clusters(bs, coffset, csize, type); 1187 } 1188 break; 1189 case QCOW2_CLUSTER_NORMAL: 1190 case QCOW2_CLUSTER_ZERO_ALLOC: 1191 if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) { 1192 qcow2_signal_corruption(bs, false, -1, -1, 1193 "Cannot free unaligned cluster %#llx", 1194 l2_entry & L2E_OFFSET_MASK); 1195 } else { 1196 qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, 1197 s->cluster_size, type); 1198 } 1199 break; 1200 case QCOW2_CLUSTER_ZERO_PLAIN: 1201 case QCOW2_CLUSTER_UNALLOCATED: 1202 break; 1203 default: 1204 abort(); 1205 } 1206 } 1207 1208 void qcow2_discard_cluster(BlockDriverState *bs, uint64_t offset, 1209 uint64_t length, QCow2ClusterType ctype, 1210 enum qcow2_discard_type dtype) 1211 { 1212 BDRVQcow2State *s = bs->opaque; 1213 1214 if (s->discard_passthrough[dtype] && 1215 (ctype == QCOW2_CLUSTER_NORMAL || 1216 ctype == QCOW2_CLUSTER_ZERO_ALLOC)) { 1217 if (has_data_file(bs)) { 1218 bdrv_pdiscard(s->data_file, offset, length); 1219 } else { 1220 queue_discard(bs, offset, length); 1221 } 1222 } 1223 } 1224 1225 int qcow2_write_caches(BlockDriverState *bs) 1226 { 1227 BDRVQcow2State *s = bs->opaque; 1228 int ret; 1229 1230 ret = qcow2_cache_write(bs, s->l2_table_cache); 1231 if (ret < 0) { 1232 return ret; 1233 } 1234 1235 if (qcow2_need_accurate_refcounts(s)) { 1236 ret = qcow2_cache_write(bs, s->refcount_block_cache); 1237 if (ret < 0) { 1238 return ret; 1239 } 1240 } 1241 1242 return 0; 1243 } 1244 1245 int qcow2_flush_caches(BlockDriverState *bs) 1246 { 1247 int ret = qcow2_write_caches(bs); 1248 if (ret < 0) { 1249 return ret; 1250 } 1251 1252 return bdrv_flush(bs->file->bs); 1253 } 1254 1255 /*********************************************************/ 1256 /* snapshots and image creation */ 1257 1258 1259 1260 /* update the refcounts of snapshots and the copied flag */ 1261 int qcow2_update_snapshot_refcount(BlockDriverState *bs, 1262 int64_t l1_table_offset, int l1_size, int addend) 1263 { 1264 BDRVQcow2State *s = bs->opaque; 1265 uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount; 1266 bool l1_allocated = false; 1267 int64_t old_entry, old_l2_offset; 1268 unsigned slice, slice_size2, n_slices; 1269 int i, j, l1_modified = 0; 1270 int ret; 1271 1272 assert(addend >= -1 && addend <= 1); 1273 1274 l2_slice = NULL; 1275 l1_table = NULL; 1276 l1_size2 = l1_size * L1E_SIZE; 1277 slice_size2 = s->l2_slice_size * l2_entry_size(s); 1278 n_slices = s->cluster_size / slice_size2; 1279 1280 s->cache_discards = true; 1281 1282 /* WARNING: qcow2_snapshot_goto relies on this function not using the 1283 * l1_table_offset when it is the current s->l1_table_offset! Be careful 1284 * when changing this! 
/*********************************************************/
/* snapshots and image creation */



/* update the refcounts of snapshots and the copied flag */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                                   int64_t l1_table_offset, int l1_size,
                                   int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * L1E_SIZE;
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(l1_size2);
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = get_l2_entry(s, l2_slice, j);
                    old_entry = entry;
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(bs, entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        if (addend != 0) {
                            uint64_t coffset;
                            int csize;

                            qcow2_parse_compressed_l2_entry(bs, entry,
                                                            &coffset, &csize);
                            ret = update_refcount(
                                bs, coffset, csize,
                                abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        set_l2_entry(s, l2_slice, j, entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
                               0);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}
/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}

/**
 * Reallocates *array so that it can hold new_size entries. *size must contain
 * the current number of entries in *array. If the reallocation fails, *array
 * and *size will not be modified and -errno will be returned. If the
 * reallocation is successful, *array will be set to the new buffer, *size
 * will be set to new_size and 0 will be returned. The size of the reallocated
 * refcount array buffer will be aligned to a cluster boundary, and the newly
 * allocated area will be zeroed.
 */
static int realloc_refcount_array(BDRVQcow2State *s, void **array,
                                  int64_t *size, int64_t new_size)
{
    int64_t old_byte_size, new_byte_size;
    void *new_ptr;

    /* Round to clusters so the array can be directly written to disk */
    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
                    * s->cluster_size;
    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
                    * s->cluster_size;

    if (new_byte_size == old_byte_size) {
        *size = new_size;
        return 0;
    }

    assert(new_byte_size > 0);

    if (new_byte_size > SIZE_MAX) {
        return -ENOMEM;
    }

    new_ptr = g_try_realloc(*array, new_byte_size);
    if (!new_ptr) {
        return -ENOMEM;
    }

    if (new_byte_size > old_byte_size) {
        memset((char *)new_ptr + old_byte_size, 0,
               new_byte_size - old_byte_size);
    }

    *array = new_ptr;
    *size = new_size;

    return 0;
}
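/*
 * Worked example for realloc_refcount_array() above: with 16-bit refcounts
 * (refcount_order = 4), 100000 entries take DIV_ROUND_UP(100000 << 4, 8) =
 * 200000 bytes, which is then rounded up to whole clusters so the in-memory
 * refcount table can be written to disk directly.
 */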
1541 */ 1542 int coroutine_fn GRAPH_RDLOCK 1543 qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, 1544 void **refcount_table, 1545 int64_t *refcount_table_size, 1546 int64_t offset, int64_t size) 1547 { 1548 BDRVQcow2State *s = bs->opaque; 1549 uint64_t start, last, cluster_offset, k, refcount; 1550 int64_t file_len; 1551 int ret; 1552 1553 if (size <= 0) { 1554 return 0; 1555 } 1556 1557 file_len = bdrv_co_getlength(bs->file->bs); 1558 if (file_len < 0) { 1559 return file_len; 1560 } 1561 1562 /* 1563 * Last cluster of qcow2 image may be semi-allocated, so it may be OK to 1564 * reference some space after file end but it should be less than one 1565 * cluster. 1566 */ 1567 if (offset + size - file_len >= s->cluster_size) { 1568 fprintf(stderr, "ERROR: counting reference for region exceeding the " 1569 "end of the file by one cluster or more: offset 0x%" PRIx64 1570 " size 0x%" PRIx64 "\n", offset, size); 1571 res->corruptions++; 1572 return 0; 1573 } 1574 1575 start = start_of_cluster(s, offset); 1576 last = start_of_cluster(s, offset + size - 1); 1577 for(cluster_offset = start; cluster_offset <= last; 1578 cluster_offset += s->cluster_size) { 1579 k = cluster_offset >> s->cluster_bits; 1580 if (k >= *refcount_table_size) { 1581 ret = realloc_refcount_array(s, refcount_table, 1582 refcount_table_size, k + 1); 1583 if (ret < 0) { 1584 res->check_errors++; 1585 return ret; 1586 } 1587 } 1588 1589 refcount = s->get_refcount(*refcount_table, k); 1590 if (refcount == s->refcount_max) { 1591 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 1592 "\n", cluster_offset); 1593 fprintf(stderr, "Use qemu-img amend to increase the refcount entry " 1594 "width or qemu-img convert to create a clean copy if the " 1595 "image cannot be opened for writing\n"); 1596 res->corruptions++; 1597 continue; 1598 } 1599 s->set_refcount(*refcount_table, k, refcount + 1); 1600 } 1601 1602 return 0; 1603 } 1604 1605 /* Flags for check_refcounts_l1() and check_refcounts_l2() */ 1606 enum { 1607 CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ 1608 }; 1609 1610 /* 1611 * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present 1612 * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN). 1613 * 1614 * This function decrements res->corruptions on success, so the caller is 1615 * responsible to increment res->corruptions prior to the call. 1616 * 1617 * On failure in-memory @l2_table may be modified. 1618 */ 1619 static int coroutine_fn GRAPH_RDLOCK 1620 fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res, 1621 uint64_t l2_offset, uint64_t *l2_table, 1622 int l2_index, bool active, 1623 bool *metadata_overlap) 1624 { 1625 BDRVQcow2State *s = bs->opaque; 1626 int ret; 1627 int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t)); 1628 uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s); 1629 int ign = active ? 
/*
 * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
 * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
 *
 * This function decrements res->corruptions on success, so the caller is
 * responsible to increment res->corruptions prior to the call.
 *
 * On failure in-memory @l2_table may be modified.
 */
static int coroutine_fn GRAPH_RDLOCK
fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
                     uint64_t l2_offset, uint64_t *l2_table,
                     int l2_index, bool active,
                     bool *metadata_overlap)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;
    int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
    uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
    int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;

    if (has_subclusters(s)) {
        uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);

        /* Allocated subclusters become zero */
        l2_bitmap |= l2_bitmap << 32;
        l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;

        set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
        set_l2_entry(s, l2_table, l2_index, 0);
    } else {
        set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
    }

    ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
                                        false);
    if (metadata_overlap) {
        *metadata_overlap = ret < 0;
    }
    if (ret < 0) {
        fprintf(stderr, "ERROR: Overlap check failed\n");
        goto fail;
    }

    ret = bdrv_co_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
                              &l2_table[idx], 0);
    if (ret < 0) {
        fprintf(stderr, "ERROR: Failed to overwrite L2 "
                "table entry: %s\n", strerror(-ret));
        goto fail;
    }

    res->corruptions--;
    res->corruptions_fixed++;
    return 0;

fail:
    res->check_errors++;
    return ret;
}
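/*
 * Note on the subcluster handling in fix_l2_entry_by_zero() above: the
 * allocation bits live in the low 32 bits of the extended L2 bitmap and the
 * zero bits in the high 32 bits. "l2_bitmap |= l2_bitmap << 32" copies every
 * allocated bit into the corresponding zero position, and masking with
 * QCOW_L2_BITMAP_ALL_ZEROES then drops the allocation bits, so exactly the
 * previously allocated subclusters read back as zeroes.
 */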
/*
 * Increases the refcount in the given refcount table for all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
 * entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int coroutine_fn GRAPH_RDLOCK
check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                   void **refcount_table,
                   int64_t *refcount_table_size, int64_t l2_offset,
                   int flags, BdrvCheckMode fix, bool active)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t l2_entry, l2_bitmap;
    uint64_t next_contiguous_offset = 0;
    int i, ret;
    size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
    g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
    bool metadata_overlap;

    /* Read L2 table from disk */
    ret = bdrv_co_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        return ret;
    }

    /* Do the actual checks */
    for (i = 0; i < s->l2_size; i++) {
        uint64_t coffset;
        int csize;
        QCow2ClusterType type;

        l2_entry = get_l2_entry(s, l2_table, i);
        l2_bitmap = get_l2_bitmap(s, l2_table, i);
        type = qcow2_get_cluster_type(bs, l2_entry);

        if (type != QCOW2_CLUSTER_COMPRESSED) {
            /* Check reserved bits of Standard Cluster Descriptor */
            if (l2_entry & L2E_STD_RESERVED_MASK) {
                fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
                        "%" PRIx64 "\n", l2_entry);
                res->corruptions++;
            }
        }

        switch (type) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
                        "copied flag must never be set for compressed "
                        "clusters\n", l2_entry & s->cluster_offset_mask);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            if (has_data_file(bs)) {
                fprintf(stderr, "ERROR compressed cluster %d with data file, "
                        "entry=0x%" PRIx64 "\n", i, l2_entry);
                res->corruptions++;
                break;
            }

            if (l2_bitmap) {
                fprintf(stderr, "ERROR compressed cluster %d with non-zero "
                        "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
                        i, l2_entry);
                res->corruptions++;
                break;
            }

            /* Mark cluster as used */
            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
            ret = qcow2_inc_refcounts_imrt(
                bs, res, refcount_table, refcount_table_size, coffset, csize);
            if (ret < 0) {
                return ret;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /*
                 * Compressed clusters are fragmented by nature. Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            if ((l2_bitmap >> 32) & l2_bitmap) {
                res->corruptions++;
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
                        "cluster has corrupted subcluster allocation bitmap\n",
                        offset);
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                bool contains_data;
                res->corruptions++;

                if (has_subclusters(s)) {
                    contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
                } else {
                    contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
                }

                if (!contains_data) {
                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
                            "cluster is not properly aligned; L2 entry "
                            "corrupted.\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
                            offset);
                    if (fix & BDRV_FIX_ERRORS) {
                        ret = fix_l2_entry_by_zero(bs, res, l2_offset,
                                                   l2_table, i, active,
                                                   &metadata_overlap);
                        if (metadata_overlap) {
                            /*
                             * Something is seriously wrong, so abort checking
                             * this L2 table.
                             */
                            return ret;
                        }

                        if (ret == 0) {
                            /*
                             * Skip marking the cluster as used
                             * (it is unused now).
                             */
                            continue;
                        }

                        /*
                         * Failed to fix.
                         * Do not abort, continue checking the rest of this
                         * L2 table's entries.
                         */
                    }
                } else {
                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
                            "not properly aligned; L2 entry corrupted.\n",
                            offset);
                }
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used */
            if (!has_data_file(bs)) {
                ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
                                               refcount_table_size,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }

        case QCOW2_CLUSTER_ZERO_PLAIN:
            /* Impossible when image has subclusters */
            assert(!l2_bitmap);
            break;

        case QCOW2_CLUSTER_UNALLOCATED:
            if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
                res->corruptions++;
                fprintf(stderr, "ERROR: Unallocated "
                        "cluster has non-zero subcluster allocation map\n");
            }
            break;

        default:
            abort();
        }
    }

    return 0;
}
1875 * 1876 * Returns the number of errors found by the checks or -errno if an internal 1877 * error occurred. 1878 */ 1879 static int coroutine_fn GRAPH_RDLOCK 1880 check_refcounts_l1(BlockDriverState *bs, BdrvCheckResult *res, 1881 void **refcount_table, int64_t *refcount_table_size, 1882 int64_t l1_table_offset, int l1_size, 1883 int flags, BdrvCheckMode fix, bool active) 1884 { 1885 BDRVQcow2State *s = bs->opaque; 1886 size_t l1_size_bytes = l1_size * L1E_SIZE; 1887 g_autofree uint64_t *l1_table = NULL; 1888 uint64_t l2_offset; 1889 int i, ret; 1890 1891 if (!l1_size) { 1892 return 0; 1893 } 1894 1895 /* Mark L1 table as used */ 1896 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size, 1897 l1_table_offset, l1_size_bytes); 1898 if (ret < 0) { 1899 return ret; 1900 } 1901 1902 l1_table = g_try_malloc(l1_size_bytes); 1903 if (l1_table == NULL) { 1904 res->check_errors++; 1905 return -ENOMEM; 1906 } 1907 1908 /* Read L1 table entries from disk */ 1909 ret = bdrv_co_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0); 1910 if (ret < 0) { 1911 fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); 1912 res->check_errors++; 1913 return ret; 1914 } 1915 1916 for (i = 0; i < l1_size; i++) { 1917 be64_to_cpus(&l1_table[i]); 1918 } 1919 1920 /* Do the actual checks */ 1921 for (i = 0; i < l1_size; i++) { 1922 if (!l1_table[i]) { 1923 continue; 1924 } 1925 1926 if (l1_table[i] & L1E_RESERVED_MASK) { 1927 fprintf(stderr, "ERROR found L1 entry with reserved bits set: " 1928 "%" PRIx64 "\n", l1_table[i]); 1929 res->corruptions++; 1930 } 1931 1932 l2_offset = l1_table[i] & L1E_OFFSET_MASK; 1933 1934 /* Mark L2 table as used */ 1935 ret = qcow2_inc_refcounts_imrt(bs, res, 1936 refcount_table, refcount_table_size, 1937 l2_offset, s->cluster_size); 1938 if (ret < 0) { 1939 return ret; 1940 } 1941 1942 /* L2 tables are cluster aligned */ 1943 if (offset_into_cluster(s, l2_offset)) { 1944 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " 1945 "cluster aligned; L1 entry corrupted\n", l2_offset); 1946 res->corruptions++; 1947 } 1948 1949 /* Process and check L2 entries */ 1950 ret = check_refcounts_l2(bs, res, refcount_table, 1951 refcount_table_size, l2_offset, flags, 1952 fix, active); 1953 if (ret < 0) { 1954 return ret; 1955 } 1956 } 1957 1958 return 0; 1959 } 1960 1961 /* 1962 * Checks the OFLAG_COPIED flag for all L1 and L2 entries. 1963 * 1964 * This function does not print an error message nor does it increment 1965 * check_errors if qcow2_get_refcount fails (this is because such an error will 1966 * have been already detected and sufficiently signaled by the calling function 1967 * (qcow2_check_refcounts) by the time this function is called). 
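*
* Background: QCOW_OFLAG_COPIED must be set in an L1/L2 entry exactly when
* the referenced cluster has refcount 1, i.e. when it is not shared with a
* snapshot and may therefore be overwritten in place. A mismatch in either
* direction is reported, and repairing means toggling the flag so that it
* matches the actual refcount again.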
1968 */ 1969 static int coroutine_fn GRAPH_RDLOCK 1970 check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) 1971 { 1972 BDRVQcow2State *s = bs->opaque; 1973 uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); 1974 int ret; 1975 uint64_t refcount; 1976 int i, j; 1977 bool repair; 1978 1979 if (fix & BDRV_FIX_ERRORS) { 1980 /* Always repair */ 1981 repair = true; 1982 } else if (fix & BDRV_FIX_LEAKS) { 1983 /* Repair only if that seems safe: This function is always 1984 * called after the refcounts have been fixed, so the refcount 1985 * is accurate if that repair was successful */ 1986 repair = !res->check_errors && !res->corruptions && !res->leaks; 1987 } else { 1988 repair = false; 1989 } 1990 1991 for (i = 0; i < s->l1_size; i++) { 1992 uint64_t l1_entry = s->l1_table[i]; 1993 uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK; 1994 int l2_dirty = 0; 1995 1996 if (!l2_offset) { 1997 continue; 1998 } 1999 2000 ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, 2001 &refcount); 2002 if (ret < 0) { 2003 /* don't print message nor increment check_errors */ 2004 continue; 2005 } 2006 if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { 2007 res->corruptions++; 2008 fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " 2009 "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n", 2010 repair ? "Repairing" : "ERROR", i, l1_entry, refcount); 2011 if (repair) { 2012 s->l1_table[i] = refcount == 1 2013 ? l1_entry | QCOW_OFLAG_COPIED 2014 : l1_entry & ~QCOW_OFLAG_COPIED; 2015 ret = qcow2_write_l1_entry(bs, i); 2016 if (ret < 0) { 2017 res->check_errors++; 2018 goto fail; 2019 } 2020 res->corruptions--; 2021 res->corruptions_fixed++; 2022 } 2023 } 2024 2025 ret = bdrv_co_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s), 2026 l2_table, 0); 2027 if (ret < 0) { 2028 fprintf(stderr, "ERROR: Could not read L2 table: %s\n", 2029 strerror(-ret)); 2030 res->check_errors++; 2031 goto fail; 2032 } 2033 2034 for (j = 0; j < s->l2_size; j++) { 2035 uint64_t l2_entry = get_l2_entry(s, l2_table, j); 2036 uint64_t data_offset = l2_entry & L2E_OFFSET_MASK; 2037 QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry); 2038 2039 if (cluster_type == QCOW2_CLUSTER_NORMAL || 2040 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) { 2041 if (has_data_file(bs)) { 2042 refcount = 1; 2043 } else { 2044 ret = qcow2_get_refcount(bs, 2045 data_offset >> s->cluster_bits, 2046 &refcount); 2047 if (ret < 0) { 2048 /* don't print message nor increment check_errors */ 2049 continue; 2050 } 2051 } 2052 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { 2053 res->corruptions++; 2054 fprintf(stderr, "%s OFLAG_COPIED data cluster: " 2055 "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n", 2056 repair ? "Repairing" : "ERROR", l2_entry, refcount); 2057 if (repair) { 2058 set_l2_entry(s, l2_table, j, 2059 refcount == 1 ? 
2060 l2_entry | QCOW_OFLAG_COPIED : 2061 l2_entry & ~QCOW_OFLAG_COPIED); 2062 l2_dirty++; 2063 } 2064 } 2065 } 2066 } 2067 2068 if (l2_dirty > 0) { 2069 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, 2070 l2_offset, s->cluster_size, 2071 false); 2072 if (ret < 0) { 2073 fprintf(stderr, "ERROR: Could not write L2 table; metadata " 2074 "overlap check failed: %s\n", strerror(-ret)); 2075 res->check_errors++; 2076 goto fail; 2077 } 2078 2079 ret = bdrv_co_pwrite(bs->file, l2_offset, s->cluster_size, l2_table, 0); 2080 if (ret < 0) { 2081 fprintf(stderr, "ERROR: Could not write L2 table: %s\n", 2082 strerror(-ret)); 2083 res->check_errors++; 2084 goto fail; 2085 } 2086 res->corruptions -= l2_dirty; 2087 res->corruptions_fixed += l2_dirty; 2088 } 2089 } 2090 2091 ret = 0; 2092 2093 fail: 2094 qemu_vfree(l2_table); 2095 return ret; 2096 } 2097 2098 /* 2099 * Checks consistency of refblocks and accounts for each refblock in 2100 * *refcount_table. 2101 */ 2102 static int coroutine_fn GRAPH_RDLOCK 2103 check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, 2104 BdrvCheckMode fix, bool *rebuild, 2105 void **refcount_table, int64_t *nb_clusters) 2106 { 2107 BDRVQcow2State *s = bs->opaque; 2108 int64_t i, size; 2109 int ret; 2110 2111 for(i = 0; i < s->refcount_table_size; i++) { 2112 uint64_t offset, cluster; 2113 offset = s->refcount_table[i] & REFT_OFFSET_MASK; 2114 cluster = offset >> s->cluster_bits; 2115 2116 if (s->refcount_table[i] & REFT_RESERVED_MASK) { 2117 fprintf(stderr, "ERROR refcount table entry %" PRId64 " has " 2118 "reserved bits set\n", i); 2119 res->corruptions++; 2120 *rebuild = true; 2121 continue; 2122 } 2123 2124 /* Refcount blocks are cluster aligned */ 2125 if (offset_into_cluster(s, offset)) { 2126 fprintf(stderr, "ERROR refcount block %" PRId64 " is not " 2127 "cluster aligned; refcount table entry corrupted\n", i); 2128 res->corruptions++; 2129 *rebuild = true; 2130 continue; 2131 } 2132 2133 if (cluster >= *nb_clusters) { 2134 res->corruptions++; 2135 fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n", 2136 fix & BDRV_FIX_ERRORS ? 
"Repairing" : "ERROR", i); 2137 2138 if (fix & BDRV_FIX_ERRORS) { 2139 int64_t new_nb_clusters; 2140 Error *local_err = NULL; 2141 2142 if (offset > INT64_MAX - s->cluster_size) { 2143 ret = -EINVAL; 2144 goto resize_fail; 2145 } 2146 2147 ret = bdrv_co_truncate(bs->file, offset + s->cluster_size, false, 2148 PREALLOC_MODE_OFF, 0, &local_err); 2149 if (ret < 0) { 2150 error_report_err(local_err); 2151 goto resize_fail; 2152 } 2153 size = bdrv_co_getlength(bs->file->bs); 2154 if (size < 0) { 2155 ret = size; 2156 goto resize_fail; 2157 } 2158 2159 new_nb_clusters = size_to_clusters(s, size); 2160 assert(new_nb_clusters >= *nb_clusters); 2161 2162 ret = realloc_refcount_array(s, refcount_table, 2163 nb_clusters, new_nb_clusters); 2164 if (ret < 0) { 2165 res->check_errors++; 2166 return ret; 2167 } 2168 2169 if (cluster >= *nb_clusters) { 2170 ret = -EINVAL; 2171 goto resize_fail; 2172 } 2173 2174 res->corruptions--; 2175 res->corruptions_fixed++; 2176 ret = qcow2_inc_refcounts_imrt(bs, res, 2177 refcount_table, nb_clusters, 2178 offset, s->cluster_size); 2179 if (ret < 0) { 2180 return ret; 2181 } 2182 /* No need to check whether the refcount is now greater than 1: 2183 * This area was just allocated and zeroed, so it can only be 2184 * exactly 1 after qcow2_inc_refcounts_imrt() */ 2185 continue; 2186 2187 resize_fail: 2188 *rebuild = true; 2189 fprintf(stderr, "ERROR could not resize image: %s\n", 2190 strerror(-ret)); 2191 } 2192 continue; 2193 } 2194 2195 if (offset != 0) { 2196 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2197 offset, s->cluster_size); 2198 if (ret < 0) { 2199 return ret; 2200 } 2201 if (s->get_refcount(*refcount_table, cluster) != 1) { 2202 fprintf(stderr, "ERROR refcount block %" PRId64 2203 " refcount=%" PRIu64 "\n", i, 2204 s->get_refcount(*refcount_table, cluster)); 2205 res->corruptions++; 2206 *rebuild = true; 2207 } 2208 } 2209 } 2210 2211 return 0; 2212 } 2213 2214 /* 2215 * Calculates an in-memory refcount table. 
2216 */ 2217 static int coroutine_fn GRAPH_RDLOCK 2218 calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 2219 BdrvCheckMode fix, bool *rebuild, 2220 void **refcount_table, int64_t *nb_clusters) 2221 { 2222 BDRVQcow2State *s = bs->opaque; 2223 int64_t i; 2224 QCowSnapshot *sn; 2225 int ret; 2226 2227 if (!*refcount_table) { 2228 int64_t old_size = 0; 2229 ret = realloc_refcount_array(s, refcount_table, 2230 &old_size, *nb_clusters); 2231 if (ret < 0) { 2232 res->check_errors++; 2233 return ret; 2234 } 2235 } 2236 2237 /* header */ 2238 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2239 0, s->cluster_size); 2240 if (ret < 0) { 2241 return ret; 2242 } 2243 2244 /* current L1 table */ 2245 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, 2246 s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO, 2247 fix, true); 2248 if (ret < 0) { 2249 return ret; 2250 } 2251 2252 /* snapshots */ 2253 if (has_data_file(bs) && s->nb_snapshots) { 2254 fprintf(stderr, "ERROR %d snapshots in image with data file\n", 2255 s->nb_snapshots); 2256 res->corruptions++; 2257 } 2258 2259 for (i = 0; i < s->nb_snapshots; i++) { 2260 sn = s->snapshots + i; 2261 if (offset_into_cluster(s, sn->l1_table_offset)) { 2262 fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": " 2263 "L1 table is not cluster aligned; snapshot table entry " 2264 "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset); 2265 res->corruptions++; 2266 continue; 2267 } 2268 if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) { 2269 fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": " 2270 "L1 table is too large; snapshot table entry corrupted\n", 2271 sn->id_str, sn->name, sn->l1_size); 2272 res->corruptions++; 2273 continue; 2274 } 2275 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, 2276 sn->l1_table_offset, sn->l1_size, 0, fix, 2277 false); 2278 if (ret < 0) { 2279 return ret; 2280 } 2281 } 2282 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2283 s->snapshots_offset, s->snapshots_size); 2284 if (ret < 0) { 2285 return ret; 2286 } 2287 2288 /* refcount data */ 2289 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2290 s->refcount_table_offset, 2291 s->refcount_table_size * 2292 REFTABLE_ENTRY_SIZE); 2293 if (ret < 0) { 2294 return ret; 2295 } 2296 2297 /* encryption */ 2298 if (s->crypto_header.length) { 2299 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 2300 s->crypto_header.offset, 2301 s->crypto_header.length); 2302 if (ret < 0) { 2303 return ret; 2304 } 2305 } 2306 2307 /* bitmaps */ 2308 ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters); 2309 if (ret < 0) { 2310 return ret; 2311 } 2312 2313 return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters); 2314 } 2315 2316 /* 2317 * Compares the actual reference count for each cluster in the image against the 2318 * refcount as reported by the refcount structures on-disk. 
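*
* A cluster whose computed refcount is lower than the on-disk refcount is a
* leak (space is merely wasted); the opposite case is a corruption (the
* cluster could be reused while still referenced). Where fixing is allowed,
* the on-disk refcount is adjusted by the difference via update_refcount().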
2319 */
2320 static void coroutine_fn GRAPH_RDLOCK
2321 compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2322 BdrvCheckMode fix, bool *rebuild,
2323 int64_t *highest_cluster,
2324 void *refcount_table, int64_t nb_clusters)
2325 {
2326 BDRVQcow2State *s = bs->opaque;
2327 int64_t i;
2328 uint64_t refcount1, refcount2;
2329 int ret;
2330
2331 for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2332 ret = qcow2_get_refcount(bs, i, &refcount1);
2333 if (ret < 0) {
2334 fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2335 i, strerror(-ret));
2336 res->check_errors++;
2337 continue;
2338 }
2339
2340 refcount2 = s->get_refcount(refcount_table, i);
2341
2342 if (refcount1 > 0 || refcount2 > 0) {
2343 *highest_cluster = i;
2344 }
2345
2346 if (refcount1 != refcount2) {
2347 /* Check if we're allowed to fix the mismatch */
2348 int *num_fixed = NULL;
2349 if (refcount1 == 0) {
2350 *rebuild = true;
2351 } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2352 num_fixed = &res->leaks_fixed;
2353 } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2354 num_fixed = &res->corruptions_fixed;
2355 }
2356
2357 fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2358 " reference=%" PRIu64 "\n",
2359 num_fixed != NULL ? "Repairing" :
2360 refcount1 < refcount2 ? "ERROR" :
2361 "Leaked",
2362 i, refcount1, refcount2);
2363
2364 if (num_fixed) {
2365 ret = update_refcount(bs, i << s->cluster_bits, 1,
2366 refcount_diff(refcount1, refcount2),
2367 refcount1 > refcount2,
2368 QCOW2_DISCARD_ALWAYS);
2369 if (ret >= 0) {
2370 (*num_fixed)++;
2371 continue;
2372 }
2373 }
2374
2375 /* And if we couldn't (or were not allowed to) fix it, account for it */
2376 if (refcount1 < refcount2) {
2377 res->corruptions++;
2378 } else {
2379 res->leaks++;
2380 }
2381 }
2382 }
2383 }
2384
2385 /*
2386 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2387 * the on-disk refcount structures.
2388 *
2389 * On input, *first_free_cluster tells where to start looking, and need not
2390 * actually be a free cluster; the returned offset will not be before that
2391 * cluster. On output, *first_free_cluster points to the first gap found, even
2392 * if that gap was too small to be used as the returned offset.
2393 *
2394 * Note that *first_free_cluster is a cluster index whereas the return value is
2395 * an offset.
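*
* A typical call, as issued by the rebuild code below (a sketch, with
* variable names taken from that code):
*
*     int64_t first_free_cluster = 0;
*     int64_t offset = alloc_clusters_imrt(bs, 1, &refcount_table,
*                                          &nb_clusters,
*                                          &first_free_cluster);
*     if (offset < 0) {
*         return offset;   (negative errno from realloc_refcount_array)
*     }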
2396 */ 2397 static int64_t alloc_clusters_imrt(BlockDriverState *bs, 2398 int cluster_count, 2399 void **refcount_table, 2400 int64_t *imrt_nb_clusters, 2401 int64_t *first_free_cluster) 2402 { 2403 BDRVQcow2State *s = bs->opaque; 2404 int64_t cluster = *first_free_cluster, i; 2405 bool first_gap = true; 2406 int contiguous_free_clusters; 2407 int ret; 2408 2409 /* Starting at *first_free_cluster, find a range of at least cluster_count 2410 * continuously free clusters */ 2411 for (contiguous_free_clusters = 0; 2412 cluster < *imrt_nb_clusters && 2413 contiguous_free_clusters < cluster_count; 2414 cluster++) 2415 { 2416 if (!s->get_refcount(*refcount_table, cluster)) { 2417 contiguous_free_clusters++; 2418 if (first_gap) { 2419 /* If this is the first free cluster found, update 2420 * *first_free_cluster accordingly */ 2421 *first_free_cluster = cluster; 2422 first_gap = false; 2423 } 2424 } else if (contiguous_free_clusters) { 2425 contiguous_free_clusters = 0; 2426 } 2427 } 2428 2429 /* If contiguous_free_clusters is greater than zero, it contains the number 2430 * of continuously free clusters until the current cluster; the first free 2431 * cluster in the current "gap" is therefore 2432 * cluster - contiguous_free_clusters */ 2433 2434 /* If no such range could be found, grow the in-memory refcount table 2435 * accordingly to append free clusters at the end of the image */ 2436 if (contiguous_free_clusters < cluster_count) { 2437 /* contiguous_free_clusters clusters are already empty at the image end; 2438 * we need cluster_count clusters; therefore, we have to allocate 2439 * cluster_count - contiguous_free_clusters new clusters at the end of 2440 * the image (which is the current value of cluster; note that cluster 2441 * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond 2442 * the image end) */ 2443 ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters, 2444 cluster + cluster_count 2445 - contiguous_free_clusters); 2446 if (ret < 0) { 2447 return ret; 2448 } 2449 } 2450 2451 /* Go back to the first free cluster */ 2452 cluster -= contiguous_free_clusters; 2453 for (i = 0; i < cluster_count; i++) { 2454 s->set_refcount(*refcount_table, cluster + i, 1); 2455 } 2456 2457 return cluster << s->cluster_bits; 2458 } 2459 2460 /* 2461 * Helper function for rebuild_refcount_structure(). 2462 * 2463 * Scan the range of clusters [first_cluster, end_cluster) for allocated 2464 * clusters and write all corresponding refblocks to disk. The refblock 2465 * and allocation data is taken from the in-memory refcount table 2466 * *refcount_table[] (of size *nb_clusters), which is basically one big 2467 * (unlimited size) refblock for the whole image. 2468 * 2469 * For these refblocks, clusters are allocated using said in-memory 2470 * refcount table. Care is taken that these allocations are reflected 2471 * in the refblocks written to disk. 2472 * 2473 * The refblocks' offsets are written into a reftable, which is 2474 * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr). If 2475 * that reftable is of insufficient size, it will be resized to fit. 2476 * This reftable is not written to disk. 2477 * 2478 * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed 2479 * to point to existing valid refblocks that do not need to be allocated 2480 * again.) 2481 * 2482 * Return whether the on-disk reftable array was resized (true/false), 2483 * or -errno on error. 
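*
* (Index arithmetic used throughout: the refblock covering a given
* cluster sits at reftable index cluster >> s->refcount_block_bits, and
* conversely refblock i covers the clusters starting at
* i << s->refcount_block_bits.)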
2484 */
2485 static int coroutine_fn GRAPH_RDLOCK
2486 rebuild_refcounts_write_refblocks(
2487 BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
2488 int64_t first_cluster, int64_t end_cluster,
2489 uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
2490 Error **errp
2491 )
2492 {
2493 BDRVQcow2State *s = bs->opaque;
2494 int64_t cluster;
2495 int64_t refblock_offset, refblock_start, refblock_index;
2496 int64_t first_free_cluster = 0;
2497 uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
2498 uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
2499 void *on_disk_refblock;
2500 bool reftable_grown = false;
2501 int ret;
2502
2503 for (cluster = first_cluster; cluster < end_cluster; cluster++) {
2504 /* Check all clusters to find refblocks that contain non-zero entries */
2505 if (!s->get_refcount(*refcount_table, cluster)) {
2506 continue;
2507 }
2508
2509 /*
2510 * This cluster is allocated, so we need to create a refblock
2511 * for it. The data we will write to disk is just the
2512 * respective slice from *refcount_table, so it will contain
2513 * accurate refcounts for all clusters belonging to this
2514 * refblock. After we have written it, we will therefore skip
2515 * all remaining clusters in this refblock.
2516 */
2517
2518 refblock_index = cluster >> s->refcount_block_bits;
2519 refblock_start = refblock_index << s->refcount_block_bits;
2520
2521 if (on_disk_reftable_entries > refblock_index &&
2522 on_disk_reftable[refblock_index])
2523 {
2524 /*
2525 * We can get here on a repeated call from rebuild_refcount_structure():
2526 * We already have a reftable from a previous run, and this refblock
2527 * is already allocated. No need to allocate it again.
2528 */
2529 refblock_offset = on_disk_reftable[refblock_index];
2530 } else {
2531 int64_t refblock_cluster_index;
2532
2533 /* Don't allocate a cluster in a refblock already written to disk */
2534 if (first_free_cluster < refblock_start) {
2535 first_free_cluster = refblock_start;
2536 }
2537 refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2538 nb_clusters,
2539 &first_free_cluster);
2540 if (refblock_offset < 0) {
2541 error_setg_errno(errp, -refblock_offset,
2542 "ERROR allocating refblock");
2543 return refblock_offset;
2544 }
2545
2546 refblock_cluster_index = refblock_offset / s->cluster_size;
2547 if (refblock_cluster_index >= end_cluster) {
2548 /*
2549 * We must write the refblock that holds this refblock's
2550 * refcount
2551 */
2552 end_cluster = refblock_cluster_index + 1;
2553 }
2554
2555 if (on_disk_reftable_entries <= refblock_index) {
2556 on_disk_reftable_entries =
2557 ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2558 s->cluster_size) / REFTABLE_ENTRY_SIZE;
2559 on_disk_reftable =
2560 g_try_realloc(on_disk_reftable,
2561 on_disk_reftable_entries *
2562 REFTABLE_ENTRY_SIZE);
2563 if (!on_disk_reftable) {
2564 error_setg(errp, "ERROR allocating reftable memory");
2565 return -ENOMEM;
2566 }
2567
2568 memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
2569 (on_disk_reftable_entries -
2570 *on_disk_reftable_entries_ptr) *
2571 REFTABLE_ENTRY_SIZE);
2572
2573 *on_disk_reftable_ptr = on_disk_reftable;
2574 *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
2575
2576 reftable_grown = true;
2577 } else {
2578 assert(on_disk_reftable);
2579 }
2580 on_disk_reftable[refblock_index] = refblock_offset;
2581 }
2582
2583 /* Refblock is allocated, write it to disk */
2584
2585 ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2586 s->cluster_size,
false);
2587 if (ret < 0) {
2588 error_setg_errno(errp, -ret, "ERROR writing refblock");
2589 return ret;
2590 }
2591
2592 /*
2593 * The refblock is simply a slice of *refcount_table.
2594 * Note that the size of *refcount_table is always aligned to
2595 * whole clusters, so the write operation will not result in
2596 * out-of-bounds accesses.
2597 */
2598 on_disk_refblock = (void *)((char *) *refcount_table +
2599 refblock_index * s->cluster_size);
2600
2601 ret = bdrv_co_pwrite(bs->file, refblock_offset, s->cluster_size,
2602 on_disk_refblock, 0);
2603 if (ret < 0) {
2604 error_setg_errno(errp, -ret, "ERROR writing refblock");
2605 return ret;
2606 }
2607
2608 /* This refblock is done, skip to its end */
2609 cluster = refblock_start + s->refcount_block_size - 1;
2610 }
2611
2612 return reftable_grown;
2613 }
2614
2615 /*
2616 * Creates a new refcount structure based solely on the in-memory information
2617 * given through *refcount_table (this in-memory information is basically just
2618 * the concatenation of all refblocks). All necessary allocations will be
2619 * reflected in that array.
2620 *
2621 * On success, the old refcount structure is leaked (it will be covered by the
2622 * new refcount structure).
2623 */
2624 static int coroutine_fn GRAPH_RDLOCK
2625 rebuild_refcount_structure(BlockDriverState *bs, BdrvCheckResult *res,
2626 void **refcount_table, int64_t *nb_clusters,
2627 Error **errp)
2628 {
2629 BDRVQcow2State *s = bs->opaque;
2630 int64_t reftable_offset = -1;
2631 int64_t reftable_length = 0;
2632 int64_t reftable_clusters;
2633 int64_t refblock_index;
2634 uint32_t on_disk_reftable_entries = 0;
2635 uint64_t *on_disk_reftable = NULL;
2636 int ret = 0;
2637 int reftable_size_changed = 0;
2638 struct {
2639 uint64_t reftable_offset;
2640 uint32_t reftable_clusters;
2641 } QEMU_PACKED reftable_offset_and_clusters;
2642
2643 qcow2_cache_empty(bs, s->refcount_block_cache);
2644
2645 /*
2646 * For each refblock containing entries, we try to allocate a
2647 * cluster (in the in-memory refcount table) and write its offset
2648 * into on_disk_reftable[]. We then write the whole refblock to
2649 * disk (as a slice of the in-memory refcount table).
2650 * This is done by rebuild_refcounts_write_refblocks().
2651 *
2652 * Once we have scanned all clusters, we try to find space for the
2653 * reftable. This will dirty the in-memory refcount table (i.e.
2654 * make it differ from the refblocks we have already written), so we
2655 * need to run rebuild_refcounts_write_refblocks() again for the
2656 * range of clusters where the reftable has been allocated.
2657 *
2658 * This second run might make the reftable grow again, in which case
2659 * we will need to allocate new space for it, which is why we
2660 * repeat all this until the reftable stops growing.
2661 *
2662 * (This loop will terminate, because every cluster the reftable
2663 * grows by lets it describe many more clusters, so at some point
2664 * it must be able to cover both itself and all the refblocks
2665 * describing it.)
2666 *
2667 * We then convert the reftable to big-endian and write it to disk.
2668 *
2669 * Note that we never free any reftable allocations. Doing so would
2670 * needlessly complicate the algorithm: The eventual second check
2671 * run we do will clean up all leaks we have caused.
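*
* (Illustrative numbers, assuming the default 64 KiB clusters and 16-bit
* refcounts: one refblock then holds 64 KiB * 8 / 16 = 32768 refcounts
* and covers 2 GiB of image, while a single reftable cluster holds 8192
* refblock pointers and covers 16 TiB. One extra reftable cluster thus
* covers far more metadata than its own growth can ever require.)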
2672 */ 2673 2674 reftable_size_changed = 2675 rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters, 2676 0, *nb_clusters, 2677 &on_disk_reftable, 2678 &on_disk_reftable_entries, errp); 2679 if (reftable_size_changed < 0) { 2680 res->check_errors++; 2681 ret = reftable_size_changed; 2682 goto fail; 2683 } 2684 2685 /* 2686 * There was no reftable before, so rebuild_refcounts_write_refblocks() 2687 * must have increased its size (from 0 to something). 2688 */ 2689 assert(reftable_size_changed); 2690 2691 do { 2692 int64_t reftable_start_cluster, reftable_end_cluster; 2693 int64_t first_free_cluster = 0; 2694 2695 reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE; 2696 reftable_clusters = size_to_clusters(s, reftable_length); 2697 2698 reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, 2699 refcount_table, nb_clusters, 2700 &first_free_cluster); 2701 if (reftable_offset < 0) { 2702 error_setg_errno(errp, -reftable_offset, 2703 "ERROR allocating reftable"); 2704 res->check_errors++; 2705 ret = reftable_offset; 2706 goto fail; 2707 } 2708 2709 /* 2710 * We need to update the affected refblocks, so re-run the 2711 * write_refblocks loop for the reftable's range of clusters. 2712 */ 2713 assert(offset_into_cluster(s, reftable_offset) == 0); 2714 reftable_start_cluster = reftable_offset / s->cluster_size; 2715 reftable_end_cluster = reftable_start_cluster + reftable_clusters; 2716 reftable_size_changed = 2717 rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters, 2718 reftable_start_cluster, 2719 reftable_end_cluster, 2720 &on_disk_reftable, 2721 &on_disk_reftable_entries, errp); 2722 if (reftable_size_changed < 0) { 2723 res->check_errors++; 2724 ret = reftable_size_changed; 2725 goto fail; 2726 } 2727 2728 /* 2729 * If the reftable size has changed, we will need to find a new 2730 * allocation, repeating the loop. 2731 */ 2732 } while (reftable_size_changed); 2733 2734 /* The above loop must have run at least once */ 2735 assert(reftable_offset >= 0); 2736 2737 /* 2738 * All allocations are done, all refblocks are written, convert the 2739 * reftable to big-endian and write it to disk. 
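* (Like all qcow2 metadata, the reftable is stored big-endian on disk,
* hence the cpu_to_be64() pass below and the be64_to_cpus() pass
* afterwards, since the same array is then kept as the in-memory
* s->refcount_table.)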
2740 */ 2741 2742 for (refblock_index = 0; refblock_index < on_disk_reftable_entries; 2743 refblock_index++) 2744 { 2745 cpu_to_be64s(&on_disk_reftable[refblock_index]); 2746 } 2747 2748 ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length, 2749 false); 2750 if (ret < 0) { 2751 error_setg_errno(errp, -ret, "ERROR writing reftable"); 2752 goto fail; 2753 } 2754 2755 assert(reftable_length < INT_MAX); 2756 ret = bdrv_co_pwrite(bs->file, reftable_offset, reftable_length, 2757 on_disk_reftable, 0); 2758 if (ret < 0) { 2759 error_setg_errno(errp, -ret, "ERROR writing reftable"); 2760 goto fail; 2761 } 2762 2763 /* Enter new reftable into the image header */ 2764 reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); 2765 reftable_offset_and_clusters.reftable_clusters = 2766 cpu_to_be32(reftable_clusters); 2767 ret = bdrv_co_pwrite_sync(bs->file, 2768 offsetof(QCowHeader, refcount_table_offset), 2769 sizeof(reftable_offset_and_clusters), 2770 &reftable_offset_and_clusters, 0); 2771 if (ret < 0) { 2772 error_setg_errno(errp, -ret, "ERROR setting reftable"); 2773 goto fail; 2774 } 2775 2776 for (refblock_index = 0; refblock_index < on_disk_reftable_entries; 2777 refblock_index++) 2778 { 2779 be64_to_cpus(&on_disk_reftable[refblock_index]); 2780 } 2781 s->refcount_table = on_disk_reftable; 2782 s->refcount_table_offset = reftable_offset; 2783 s->refcount_table_size = on_disk_reftable_entries; 2784 update_max_refcount_table_index(s); 2785 2786 return 0; 2787 2788 fail: 2789 g_free(on_disk_reftable); 2790 return ret; 2791 } 2792 2793 /* 2794 * Checks an image for refcount consistency. 2795 * 2796 * Returns 0 if no errors are found, the number of errors in case the image is 2797 * detected as corrupted, and -errno when an internal error occurred. 
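*
* (This is what backs qemu-img check for qcow2 images; fix is a mask of
* BDRV_FIX_LEAKS and/or BDRV_FIX_ERRORS as requested by the caller.)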
2798 */
2799 int coroutine_fn GRAPH_RDLOCK
2800 qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix)
2801 {
2802 BDRVQcow2State *s = bs->opaque;
2803 BdrvCheckResult pre_compare_res;
2804 int64_t size, highest_cluster, nb_clusters;
2805 void *refcount_table = NULL;
2806 bool rebuild = false;
2807 int ret;
2808
2809 size = bdrv_co_getlength(bs->file->bs);
2810 if (size < 0) {
2811 res->check_errors++;
2812 return size;
2813 }
2814
2815 nb_clusters = size_to_clusters(s, size);
2816 if (nb_clusters > INT_MAX) {
2817 res->check_errors++;
2818 return -EFBIG;
2819 }
2820
2821 res->bfi.total_clusters =
2822 size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2823
2824 ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2825 &nb_clusters);
2826 if (ret < 0) {
2827 goto fail;
2828 }
2829
2830 /* This first compare_refcounts() call does not fix anything; if it turns
2831 * out that no rebuild is needed (but we still want to fix something), it
2832 * is called again below, and this first result is discarded */
2833 pre_compare_res = *res;
2834 compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2835 nb_clusters);
2836
2837 if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2838 BdrvCheckResult old_res = *res;
2839 int fresh_leaks = 0;
2840 Error *local_err = NULL;
2841
2842 fprintf(stderr, "Rebuilding refcount structure\n");
2843 ret = rebuild_refcount_structure(bs, res, &refcount_table,
2844 &nb_clusters, &local_err);
2845 if (ret < 0) {
2846 error_report_err(local_err);
2847 goto fail;
2848 }
2849
2850 res->corruptions = 0;
2851 res->leaks = 0;
2852
2853 /* Because the old reftable has been exchanged for a new one, the
2854 * references have to be recalculated */
2855 rebuild = false;
2856 memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2857 ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2858 &nb_clusters);
2859 if (ret < 0) {
2860 goto fail;
2861 }
2862
2863 if (fix & BDRV_FIX_LEAKS) {
2864 /* The old refcount structures are now leaked; fix that. The result
2865 * can be ignored, apart from leaks which were introduced by
2866 * rebuild_refcount_structure() and could not be fixed */
2867 BdrvCheckResult saved_res = *res;
2868 *res = (BdrvCheckResult){ 0 };
2869
2870 compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2871 &highest_cluster, refcount_table, nb_clusters);
2872 if (rebuild) {
2873 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2874 "broken\n");
2875 }
2876
2877 /* Any leaks accounted for here were introduced by
2878 * rebuild_refcount_structure() because that function has created a
2879 * new refcount structure from scratch */
2880 fresh_leaks = res->leaks;
2881 *res = saved_res;
2882 }
2883
2884 if (res->corruptions < old_res.corruptions) {
2885 res->corruptions_fixed += old_res.corruptions - res->corruptions;
2886 }
2887 if (res->leaks < old_res.leaks) {
2888 res->leaks_fixed += old_res.leaks - res->leaks;
2889 }
2890 res->leaks += fresh_leaks;
2891 } else if (fix) {
2892 if (rebuild) {
2893 fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2894 res->check_errors++;
2895 ret = -EIO;
2896 goto fail;
2897 }
2898
2899 if (res->leaks || res->corruptions) {
2900 *res = pre_compare_res;
2901 compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2902 refcount_table, nb_clusters);
2903 }
2904 }
2905
2906 /* check OFLAG_COPIED */
2907 ret = check_oflag_copied(bs, res, fix);
2908 if (ret < 0) {
2909 goto fail;
2910 }
2911
2912 res->image_end_offset = (highest_cluster + 1) *
s->cluster_size; 2913 ret = 0; 2914 2915 fail: 2916 g_free(refcount_table); 2917 2918 return ret; 2919 } 2920 2921 #define overlaps_with(ofs, sz) \ 2922 ranges_overlap(offset, size, ofs, sz) 2923 2924 /* 2925 * Checks if the given offset into the image file is actually free to use by 2926 * looking for overlaps with important metadata sections (L1/L2 tables etc.), 2927 * i.e. a sanity check without relying on the refcount tables. 2928 * 2929 * The ign parameter specifies what checks not to perform (being a bitmask of 2930 * QCow2MetadataOverlap values), i.e., what sections to ignore. 2931 * 2932 * Returns: 2933 * - 0 if writing to this offset will not affect the mentioned metadata 2934 * - a positive QCow2MetadataOverlap value indicating one overlapping section 2935 * - a negative value (-errno) indicating an error while performing a check, 2936 * e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2 2937 */ 2938 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, 2939 int64_t size) 2940 { 2941 BDRVQcow2State *s = bs->opaque; 2942 int chk = s->overlap_check & ~ign; 2943 int i, j; 2944 2945 if (!size) { 2946 return 0; 2947 } 2948 2949 if (chk & QCOW2_OL_MAIN_HEADER) { 2950 if (offset < s->cluster_size) { 2951 return QCOW2_OL_MAIN_HEADER; 2952 } 2953 } 2954 2955 /* align range to test to cluster boundaries */ 2956 size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size); 2957 offset = start_of_cluster(s, offset); 2958 2959 if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { 2960 if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) { 2961 return QCOW2_OL_ACTIVE_L1; 2962 } 2963 } 2964 2965 if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { 2966 if (overlaps_with(s->refcount_table_offset, 2967 s->refcount_table_size * REFTABLE_ENTRY_SIZE)) { 2968 return QCOW2_OL_REFCOUNT_TABLE; 2969 } 2970 } 2971 2972 if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { 2973 if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { 2974 return QCOW2_OL_SNAPSHOT_TABLE; 2975 } 2976 } 2977 2978 if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { 2979 for (i = 0; i < s->nb_snapshots; i++) { 2980 if (s->snapshots[i].l1_size && 2981 overlaps_with(s->snapshots[i].l1_table_offset, 2982 s->snapshots[i].l1_size * L1E_SIZE)) { 2983 return QCOW2_OL_INACTIVE_L1; 2984 } 2985 } 2986 } 2987 2988 if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { 2989 for (i = 0; i < s->l1_size; i++) { 2990 if ((s->l1_table[i] & L1E_OFFSET_MASK) && 2991 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, 2992 s->cluster_size)) { 2993 return QCOW2_OL_ACTIVE_L2; 2994 } 2995 } 2996 } 2997 2998 if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { 2999 unsigned last_entry = s->max_refcount_table_index; 3000 assert(last_entry < s->refcount_table_size); 3001 assert(last_entry + 1 == s->refcount_table_size || 3002 (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0); 3003 for (i = 0; i <= last_entry; i++) { 3004 if ((s->refcount_table[i] & REFT_OFFSET_MASK) && 3005 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, 3006 s->cluster_size)) { 3007 return QCOW2_OL_REFCOUNT_BLOCK; 3008 } 3009 } 3010 } 3011 3012 if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { 3013 for (i = 0; i < s->nb_snapshots; i++) { 3014 uint64_t l1_ofs = s->snapshots[i].l1_table_offset; 3015 uint32_t l1_sz = s->snapshots[i].l1_size; 3016 uint64_t l1_sz2 = l1_sz * L1E_SIZE; 3017 uint64_t *l1; 3018 int ret; 3019 3020 ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE, 3021 QCOW_MAX_L1_SIZE, "", 
NULL); 3022 if (ret < 0) { 3023 return ret; 3024 } 3025 3026 l1 = g_try_malloc(l1_sz2); 3027 3028 if (l1_sz2 && l1 == NULL) { 3029 return -ENOMEM; 3030 } 3031 3032 ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0); 3033 if (ret < 0) { 3034 g_free(l1); 3035 return ret; 3036 } 3037 3038 for (j = 0; j < l1_sz; j++) { 3039 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; 3040 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { 3041 g_free(l1); 3042 return QCOW2_OL_INACTIVE_L2; 3043 } 3044 } 3045 3046 g_free(l1); 3047 } 3048 } 3049 3050 if ((chk & QCOW2_OL_BITMAP_DIRECTORY) && 3051 (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) 3052 { 3053 if (overlaps_with(s->bitmap_directory_offset, 3054 s->bitmap_directory_size)) 3055 { 3056 return QCOW2_OL_BITMAP_DIRECTORY; 3057 } 3058 } 3059 3060 return 0; 3061 } 3062 3063 static const char *metadata_ol_names[] = { 3064 [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", 3065 [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", 3066 [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", 3067 [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", 3068 [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", 3069 [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", 3070 [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", 3071 [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", 3072 [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = "bitmap directory", 3073 }; 3074 QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names)); 3075 3076 /* 3077 * First performs a check for metadata overlaps (through 3078 * qcow2_check_metadata_overlap); if that fails with a negative value (error 3079 * while performing a check), that value is returned. If an impending overlap 3080 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt 3081 * and -EIO returned. 3082 * 3083 * Returns 0 if there were neither overlaps nor errors while checking for 3084 * overlaps; or a negative value (-errno) on error. 3085 */ 3086 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, 3087 int64_t size, bool data_file) 3088 { 3089 int ret; 3090 3091 if (data_file && has_data_file(bs)) { 3092 return 0; 3093 } 3094 3095 ret = qcow2_check_metadata_overlap(bs, ign, offset, size); 3096 if (ret < 0) { 3097 return ret; 3098 } else if (ret > 0) { 3099 int metadata_ol_bitnr = ctz32(ret); 3100 assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); 3101 3102 qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " 3103 "write on metadata (overlaps with %s)", 3104 metadata_ol_names[metadata_ol_bitnr]); 3105 return -EIO; 3106 } 3107 3108 return 0; 3109 } 3110 3111 /* A pointer to a function of this type is given to walk_over_reftable(). That 3112 * function will create refblocks and pass them to a RefblockFinishOp once they 3113 * are completed (@refblock). @refblock_empty is set if the refblock is 3114 * completely empty. 3115 * 3116 * Along with the refblock, a corresponding reftable entry is passed, in the 3117 * reftable @reftable (which may be reallocated) at @reftable_index. 3118 * 3119 * @allocated should be set to true if a new cluster has been allocated. 3120 */ 3121 typedef int /* GRAPH_RDLOCK_PTR */ 3122 (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable, 3123 uint64_t reftable_index, uint64_t *reftable_size, 3124 void *refblock, bool refblock_empty, 3125 bool *allocated, Error **errp); 3126 3127 /** 3128 * This "operation" for walk_over_reftable() allocates the refblock on disk (if 3129 * it is not empty) and inserts its offset into the new reftable. 
The size of 3130 * this new reftable is increased as required. 3131 */ 3132 static int GRAPH_RDLOCK 3133 alloc_refblock(BlockDriverState *bs, uint64_t **reftable, 3134 uint64_t reftable_index, uint64_t *reftable_size, 3135 void *refblock, bool refblock_empty, bool *allocated, 3136 Error **errp) 3137 { 3138 BDRVQcow2State *s = bs->opaque; 3139 int64_t offset; 3140 3141 if (!refblock_empty && reftable_index >= *reftable_size) { 3142 uint64_t *new_reftable; 3143 uint64_t new_reftable_size; 3144 3145 new_reftable_size = ROUND_UP(reftable_index + 1, 3146 s->cluster_size / REFTABLE_ENTRY_SIZE); 3147 if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) { 3148 error_setg(errp, 3149 "This operation would make the refcount table grow " 3150 "beyond the maximum size supported by QEMU, aborting"); 3151 return -ENOTSUP; 3152 } 3153 3154 new_reftable = g_try_realloc(*reftable, new_reftable_size * 3155 REFTABLE_ENTRY_SIZE); 3156 if (!new_reftable) { 3157 error_setg(errp, "Failed to increase reftable buffer size"); 3158 return -ENOMEM; 3159 } 3160 3161 memset(new_reftable + *reftable_size, 0, 3162 (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE); 3163 3164 *reftable = new_reftable; 3165 *reftable_size = new_reftable_size; 3166 } 3167 3168 if (!refblock_empty && !(*reftable)[reftable_index]) { 3169 offset = qcow2_alloc_clusters(bs, s->cluster_size); 3170 if (offset < 0) { 3171 error_setg_errno(errp, -offset, "Failed to allocate refblock"); 3172 return offset; 3173 } 3174 (*reftable)[reftable_index] = offset; 3175 *allocated = true; 3176 } 3177 3178 return 0; 3179 } 3180 3181 /** 3182 * This "operation" for walk_over_reftable() writes the refblock to disk at the 3183 * offset specified by the new reftable's entry. It does not modify the new 3184 * reftable or change any refcounts. 3185 */ 3186 static int GRAPH_RDLOCK 3187 flush_refblock(BlockDriverState *bs, uint64_t **reftable, 3188 uint64_t reftable_index, uint64_t *reftable_size, 3189 void *refblock, bool refblock_empty, bool *allocated, 3190 Error **errp) 3191 { 3192 BDRVQcow2State *s = bs->opaque; 3193 int64_t offset; 3194 int ret; 3195 3196 if (reftable_index < *reftable_size && (*reftable)[reftable_index]) { 3197 offset = (*reftable)[reftable_index]; 3198 3199 ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size, 3200 false); 3201 if (ret < 0) { 3202 error_setg_errno(errp, -ret, "Overlap check failed"); 3203 return ret; 3204 } 3205 3206 ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0); 3207 if (ret < 0) { 3208 error_setg_errno(errp, -ret, "Failed to write refblock"); 3209 return ret; 3210 } 3211 } else { 3212 assert(refblock_empty); 3213 } 3214 3215 return 0; 3216 } 3217 3218 /** 3219 * This function walks over the existing reftable and every referenced refblock; 3220 * if @new_set_refcount is non-NULL, it is called for every refcount entry to 3221 * create an equal new entry in the passed @new_refblock. Once that 3222 * @new_refblock is completely filled, @operation will be called. 3223 * 3224 * @status_cb and @cb_opaque are used for the amend operation's status callback. 3225 * @index is the index of the walk_over_reftable() calls and @total is the total 3226 * number of walk_over_reftable() calls per amend operation. Both are used for 3227 * calculating the parameters for the status callback. 3228 * 3229 * @allocated is set to true if a new cluster has been allocated. 
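*
* (qcow2_change_refcount_order() below drives this with two operations:
* alloc_refblock() to reserve clusters for all non-empty refblocks, and
* flush_refblock() to write the converted refblocks out once the
* allocations have converged.)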
3230 */ 3231 static int GRAPH_RDLOCK 3232 walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable, 3233 uint64_t *new_reftable_index, 3234 uint64_t *new_reftable_size, 3235 void *new_refblock, int new_refblock_size, 3236 int new_refcount_bits, 3237 RefblockFinishOp *operation, bool *allocated, 3238 Qcow2SetRefcountFunc *new_set_refcount, 3239 BlockDriverAmendStatusCB *status_cb, 3240 void *cb_opaque, int index, int total, 3241 Error **errp) 3242 { 3243 BDRVQcow2State *s = bs->opaque; 3244 uint64_t reftable_index; 3245 bool new_refblock_empty = true; 3246 int refblock_index; 3247 int new_refblock_index = 0; 3248 int ret; 3249 3250 for (reftable_index = 0; reftable_index < s->refcount_table_size; 3251 reftable_index++) 3252 { 3253 uint64_t refblock_offset = s->refcount_table[reftable_index] 3254 & REFT_OFFSET_MASK; 3255 3256 status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index, 3257 (uint64_t)total * s->refcount_table_size, cb_opaque); 3258 3259 if (refblock_offset) { 3260 void *refblock; 3261 3262 if (offset_into_cluster(s, refblock_offset)) { 3263 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" 3264 PRIx64 " unaligned (reftable index: %#" 3265 PRIx64 ")", refblock_offset, 3266 reftable_index); 3267 error_setg(errp, 3268 "Image is corrupt (unaligned refblock offset)"); 3269 return -EIO; 3270 } 3271 3272 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset, 3273 &refblock); 3274 if (ret < 0) { 3275 error_setg_errno(errp, -ret, "Failed to retrieve refblock"); 3276 return ret; 3277 } 3278 3279 for (refblock_index = 0; refblock_index < s->refcount_block_size; 3280 refblock_index++) 3281 { 3282 uint64_t refcount; 3283 3284 if (new_refblock_index >= new_refblock_size) { 3285 /* new_refblock is now complete */ 3286 ret = operation(bs, new_reftable, *new_reftable_index, 3287 new_reftable_size, new_refblock, 3288 new_refblock_empty, allocated, errp); 3289 if (ret < 0) { 3290 qcow2_cache_put(s->refcount_block_cache, &refblock); 3291 return ret; 3292 } 3293 3294 (*new_reftable_index)++; 3295 new_refblock_index = 0; 3296 new_refblock_empty = true; 3297 } 3298 3299 refcount = s->get_refcount(refblock, refblock_index); 3300 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) { 3301 uint64_t offset; 3302 3303 qcow2_cache_put(s->refcount_block_cache, &refblock); 3304 3305 offset = ((reftable_index << s->refcount_block_bits) 3306 + refblock_index) << s->cluster_bits; 3307 3308 error_setg(errp, "Cannot decrease refcount entry width to " 3309 "%i bits: Cluster at offset %#" PRIx64 " has a " 3310 "refcount of %" PRIu64, new_refcount_bits, 3311 offset, refcount); 3312 return -EINVAL; 3313 } 3314 3315 if (new_set_refcount) { 3316 new_set_refcount(new_refblock, new_refblock_index++, 3317 refcount); 3318 } else { 3319 new_refblock_index++; 3320 } 3321 new_refblock_empty = new_refblock_empty && refcount == 0; 3322 } 3323 3324 qcow2_cache_put(s->refcount_block_cache, &refblock); 3325 } else { 3326 /* No refblock means every refcount is 0 */ 3327 for (refblock_index = 0; refblock_index < s->refcount_block_size; 3328 refblock_index++) 3329 { 3330 if (new_refblock_index >= new_refblock_size) { 3331 /* new_refblock is now complete */ 3332 ret = operation(bs, new_reftable, *new_reftable_index, 3333 new_reftable_size, new_refblock, 3334 new_refblock_empty, allocated, errp); 3335 if (ret < 0) { 3336 return ret; 3337 } 3338 3339 (*new_reftable_index)++; 3340 new_refblock_index = 0; 3341 new_refblock_empty = true; 3342 } 3343 3344 if (new_set_refcount) { 3345 
new_set_refcount(new_refblock, new_refblock_index++, 0);
3346 } else {
3347 new_refblock_index++;
3348 }
3349 }
3350 }
3351 }
3352
3353 if (new_refblock_index > 0) {
3354 /* Complete the potentially existing partially filled final refblock */
3355 if (new_set_refcount) {
3356 for (; new_refblock_index < new_refblock_size;
3357 new_refblock_index++)
3358 {
3359 new_set_refcount(new_refblock, new_refblock_index, 0);
3360 }
3361 }
3362
3363 ret = operation(bs, new_reftable, *new_reftable_index,
3364 new_reftable_size, new_refblock, new_refblock_empty,
3365 allocated, errp);
3366 if (ret < 0) {
3367 return ret;
3368 }
3369
3370 (*new_reftable_index)++;
3371 }
3372
3373 status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3374 (uint64_t)total * s->refcount_table_size, cb_opaque);
3375
3376 return 0;
3377 }
3378
3379 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3380 BlockDriverAmendStatusCB *status_cb,
3381 void *cb_opaque, Error **errp)
3382 {
3383 BDRVQcow2State *s = bs->opaque;
3384 Qcow2GetRefcountFunc *new_get_refcount;
3385 Qcow2SetRefcountFunc *new_set_refcount;
3386 void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3387 uint64_t *new_reftable = NULL, new_reftable_size = 0;
3388 uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3389 uint64_t new_reftable_index = 0;
3390 uint64_t i;
3391 int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3392 int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3393 int old_refcount_order;
3394 int walk_index = 0;
3395 int ret;
3396 bool new_allocation;
3397
3398 assert(s->qcow_version >= 3);
3399 assert(refcount_order >= 0 && refcount_order <= 6);
3400
3401 /* see qcow2_open() */
3402 new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3403
3404 new_get_refcount = get_refcount_funcs[refcount_order];
3405 new_set_refcount = set_refcount_funcs[refcount_order];
3406
3407
3408 do {
3409 int total_walks;
3410
3411 new_allocation = false;
3412
3413 /* We have to do at least this allocation walk and the final walk which
3414 * writes the refblocks; in addition, this loop normally runs at least
3415 * twice: once to do the allocations, and once more to verify that
3416 * nothing new needed to be allocated. That makes (at least) three
3417 * walks in total */
3418 total_walks = MAX(walk_index + 2, 3);
3419
3420 /* First, allocate the new refblocks so that these allocations are
3421 * reflected in the refcount structures */
3422 ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3423 &new_reftable_size, NULL, new_refblock_size,
3424 new_refcount_bits, &alloc_refblock,
3425 &new_allocation, NULL, status_cb, cb_opaque,
3426 walk_index++, total_walks, errp);
3427 if (ret < 0) {
3428 goto done;
3429 }
3430
3431 new_reftable_index = 0;
3432
3433 if (new_allocation) {
3434 if (new_reftable_offset) {
3435 qcow2_free_clusters(
3436 bs, new_reftable_offset,
3437 allocated_reftable_size * REFTABLE_ENTRY_SIZE,
3438 QCOW2_DISCARD_NEVER);
3439 }
3440
3441 new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3442 REFTABLE_ENTRY_SIZE);
3443 if (new_reftable_offset < 0) {
3444 error_setg_errno(errp, -new_reftable_offset,
3445 "Failed to allocate the new reftable");
3446 ret = new_reftable_offset;
3447 goto done;
3448 }
3449 allocated_reftable_size = new_reftable_size;
3450 }
3451 } while (new_allocation);
3452
3453 /* Second, write the new refblocks */
3454 ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3455 &new_reftable_size, new_refblock,
3456
new_refblock_size, new_refcount_bits, 3457 &flush_refblock, &new_allocation, new_set_refcount, 3458 status_cb, cb_opaque, walk_index, walk_index + 1, 3459 errp); 3460 if (ret < 0) { 3461 goto done; 3462 } 3463 assert(!new_allocation); 3464 3465 3466 /* Write the new reftable */ 3467 ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset, 3468 new_reftable_size * REFTABLE_ENTRY_SIZE, 3469 false); 3470 if (ret < 0) { 3471 error_setg_errno(errp, -ret, "Overlap check failed"); 3472 goto done; 3473 } 3474 3475 for (i = 0; i < new_reftable_size; i++) { 3476 cpu_to_be64s(&new_reftable[i]); 3477 } 3478 3479 ret = bdrv_pwrite(bs->file, new_reftable_offset, 3480 new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable, 3481 0); 3482 3483 for (i = 0; i < new_reftable_size; i++) { 3484 be64_to_cpus(&new_reftable[i]); 3485 } 3486 3487 if (ret < 0) { 3488 error_setg_errno(errp, -ret, "Failed to write the new reftable"); 3489 goto done; 3490 } 3491 3492 3493 /* Empty the refcount cache */ 3494 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 3495 if (ret < 0) { 3496 error_setg_errno(errp, -ret, "Failed to flush the refblock cache"); 3497 goto done; 3498 } 3499 3500 /* Update the image header to point to the new reftable; this only updates 3501 * the fields which are relevant to qcow2_update_header(); other fields 3502 * such as s->refcount_table or s->refcount_bits stay stale for now 3503 * (because we have to restore everything if qcow2_update_header() fails) */ 3504 old_refcount_order = s->refcount_order; 3505 old_reftable_size = s->refcount_table_size; 3506 old_reftable_offset = s->refcount_table_offset; 3507 3508 s->refcount_order = refcount_order; 3509 s->refcount_table_size = new_reftable_size; 3510 s->refcount_table_offset = new_reftable_offset; 3511 3512 ret = qcow2_update_header(bs); 3513 if (ret < 0) { 3514 s->refcount_order = old_refcount_order; 3515 s->refcount_table_size = old_reftable_size; 3516 s->refcount_table_offset = old_reftable_offset; 3517 error_setg_errno(errp, -ret, "Failed to update the qcow2 header"); 3518 goto done; 3519 } 3520 3521 /* Now update the rest of the in-memory information */ 3522 old_reftable = s->refcount_table; 3523 s->refcount_table = new_reftable; 3524 update_max_refcount_table_index(s); 3525 3526 s->refcount_bits = 1 << refcount_order; 3527 s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); 3528 s->refcount_max += s->refcount_max - 1; 3529 3530 s->refcount_block_bits = s->cluster_bits - (refcount_order - 3); 3531 s->refcount_block_size = 1 << s->refcount_block_bits; 3532 3533 s->get_refcount = new_get_refcount; 3534 s->set_refcount = new_set_refcount; 3535 3536 /* For cleaning up all old refblocks and the old reftable below the "done" 3537 * label */ 3538 new_reftable = old_reftable; 3539 new_reftable_size = old_reftable_size; 3540 new_reftable_offset = old_reftable_offset; 3541 3542 done: 3543 if (new_reftable) { 3544 /* On success, new_reftable actually points to the old reftable (and 3545 * new_reftable_size is the old reftable's size); but that is just 3546 * fine */ 3547 for (i = 0; i < new_reftable_size; i++) { 3548 uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK; 3549 if (offset) { 3550 qcow2_free_clusters(bs, offset, s->cluster_size, 3551 QCOW2_DISCARD_OTHER); 3552 } 3553 } 3554 g_free(new_reftable); 3555 3556 if (new_reftable_offset > 0) { 3557 qcow2_free_clusters(bs, new_reftable_offset, 3558 new_reftable_size * REFTABLE_ENTRY_SIZE, 3559 QCOW2_DISCARD_OTHER); 3560 } 3561 } 3562 3563 qemu_vfree(new_refblock); 3564 return ret; 3565 
}
3566
3567 static int64_t coroutine_fn GRAPH_RDLOCK
3568 get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3569 {
3570 BDRVQcow2State *s = bs->opaque;
3571 uint32_t index = offset_to_reftable_index(s, offset);
3572 int64_t covering_refblock_offset = 0;
3573
3574 if (index < s->refcount_table_size) {
3575 covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3576 }
3577 if (!covering_refblock_offset) {
3578 qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3579 "not covered by the refcount structures",
3580 offset);
3581 return -EIO;
3582 }
3583
3584 return covering_refblock_offset;
3585 }
3586
3587 static int coroutine_fn GRAPH_RDLOCK
3588 qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
3589 {
3590 BDRVQcow2State *s = bs->opaque;
3591 int64_t refblock_offs;
3592 uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3593 uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3594 void *refblock;
3595 int ret;
3596
3597 refblock_offs = get_refblock_offset(bs, discard_block_offs);
3598 if (refblock_offs < 0) {
3599 return refblock_offs;
3600 }
3601
3602 assert(discard_block_offs != 0);
3603
3604 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3605 &refblock);
3606 if (ret < 0) {
3607 return ret;
3608 }
3609
3610 if (s->get_refcount(refblock, block_index) != 1) {
3611 qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3612 " refblock offset %#" PRIx64
3613 ", reftable index %u"
3614 ", block offset %#" PRIx64
3615 ", refcount %#" PRIx64,
3616 refblock_offs,
3617 offset_to_reftable_index(s, discard_block_offs),
3618 discard_block_offs,
3619 s->get_refcount(refblock, block_index));
3620 qcow2_cache_put(s->refcount_block_cache, &refblock);
3621 return -EINVAL;
3622 }
3623 s->set_refcount(refblock, block_index, 0);
3624
3625 qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3626
3627 qcow2_cache_put(s->refcount_block_cache, &refblock);
3628
3629 if (cluster_index < s->free_cluster_index) {
3630 s->free_cluster_index = cluster_index;
3631 }
3632
3633 refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3634 discard_block_offs);
3635 if (refblock) {
3636 /* discard the refblock from the cache if it is cached */
3637 qcow2_cache_discard(s->refcount_block_cache, refblock);
3638 }
3639 queue_discard(bs, discard_block_offs, s->cluster_size);
3640
3641 return 0;
3642 }
3643
3644 int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
3645 {
3646 BDRVQcow2State *s = bs->opaque;
3647 uint64_t *reftable_tmp =
3648 g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3649 int i, ret;
3650
3651 for (i = 0; i < s->refcount_table_size; i++) {
3652 int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3653 void *refblock;
3654 bool unused_block;
3655
3656 if (refblock_offs == 0) {
3657 reftable_tmp[i] = 0;
3658 continue;
3659 }
3660 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3661 &refblock);
3662 if (ret < 0) {
3663 goto out;
3664 }
3665
3666 /* this refblock holds its own refcount; mask it out for the test */
3667 if (i == offset_to_reftable_index(s, refblock_offs)) {
3668 uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3669 (s->refcount_block_size - 1);
3670 uint64_t refcount = s->get_refcount(refblock, block_index);
3671
3672 s->set_refcount(refblock, block_index, 0);
3673
3674 unused_block = buffer_is_zero(refblock, s->cluster_size);
3675
3676 s->set_refcount(refblock, block_index, refcount);
3677 } else {
3678 unused_block =
buffer_is_zero(refblock, s->cluster_size);
3679 }
3680 qcow2_cache_put(s->refcount_block_cache, &refblock);
3681
3682 reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3683 }
3684
3685 ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
3686 s->refcount_table_size * REFTABLE_ENTRY_SIZE,
3687 reftable_tmp, 0);
3688 /*
3689 * If the write to the reftable failed, the image may contain a partially
3690 * overwritten reftable. In this case it is better to also clear the
3691 * in-memory reftable entries, to avoid possible image corruption.
3692 */
3693 for (i = 0; i < s->refcount_table_size; i++) {
3694 if (s->refcount_table[i] && !reftable_tmp[i]) {
3695 if (ret == 0) {
3696 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3697 REFT_OFFSET_MASK);
3698 }
3699 s->refcount_table[i] = 0;
3700 }
3701 }
3702
3703 if (!s->cache_discards) {
3704 qcow2_process_discards(bs, ret);
3705 }
3706
3707 out:
3708 g_free(reftable_tmp);
3709 return ret;
3710 }
3711
3712 int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3713 {
3714 BDRVQcow2State *s = bs->opaque;
3715 int64_t i;
3716
3717 for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3718 uint64_t refcount;
3719 int ret = qcow2_get_refcount(bs, i, &refcount);
3720 if (ret < 0) {
3721 fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3722 i, strerror(-ret));
3723 return ret;
3724 }
3725 if (refcount > 0) {
3726 return i;
3727 }
3728 }
3729 qcow2_signal_corruption(bs, true, -1, -1,
3730 "There are no references in the refcount table.");
3731 return -EIO;
3732 }
3733
3734 int coroutine_fn GRAPH_RDLOCK
3735 qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3736 {
3737 BDRVQcow2State *s = bs->opaque;
3738 int64_t i, end_cluster, cluster_count = 0, threshold;
3739 int64_t file_length, real_allocation, real_clusters;
3740
3741 qemu_co_mutex_assert_locked(&s->lock);
3742
3743 file_length = bdrv_co_getlength(bs->file->bs);
3744 if (file_length < 0) {
3745 return file_length;
3746 }
3747
3748 real_allocation = bdrv_co_get_allocated_file_size(bs->file->bs);
3749 if (real_allocation < 0) {
3750 return real_allocation;
3751 }
3752
3753 real_clusters = real_allocation / s->cluster_size;
3754 threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3755
3756 end_cluster = size_to_clusters(s, file_length);
3757 for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3758 uint64_t refcount;
3759 int ret = qcow2_get_refcount(bs, i, &refcount);
3760 if (ret < 0) {
3761 return ret;
3762 }
3763 cluster_count += !!refcount;
3764 }
3765
3766 return cluster_count >= threshold;
3767 }
3768
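/*
 * Note on the heuristic above: the image is reported as metadata-preallocated
 * when the number of referenced clusters exceeds what the underlying file has
 * actually allocated by more than roughly 11% (real_clusters * 10 / 9), with
 * a floor of real_clusters + 2 so that very small images do not trip the
 * check.
 */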