1 /* 2 * Block driver for the QCOW version 2 format 3 * 4 * Copyright (c) 2004-2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 25 #include "qemu/osdep.h" 26 #include "qapi/error.h" 27 #include "qemu-common.h" 28 #include "block/block_int.h" 29 #include "block/qcow2.h" 30 #include "qemu/range.h" 31 #include "qemu/bswap.h" 32 #include "qemu/cutils.h" 33 34 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); 35 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, 36 int64_t offset, int64_t length, uint64_t addend, 37 bool decrease, enum qcow2_discard_type type); 38 39 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index); 40 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index); 41 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index); 42 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index); 43 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index); 44 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index); 45 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index); 46 47 static void set_refcount_ro0(void *refcount_array, uint64_t index, 48 uint64_t value); 49 static void set_refcount_ro1(void *refcount_array, uint64_t index, 50 uint64_t value); 51 static void set_refcount_ro2(void *refcount_array, uint64_t index, 52 uint64_t value); 53 static void set_refcount_ro3(void *refcount_array, uint64_t index, 54 uint64_t value); 55 static void set_refcount_ro4(void *refcount_array, uint64_t index, 56 uint64_t value); 57 static void set_refcount_ro5(void *refcount_array, uint64_t index, 58 uint64_t value); 59 static void set_refcount_ro6(void *refcount_array, uint64_t index, 60 uint64_t value); 61 62 63 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = { 64 &get_refcount_ro0, 65 &get_refcount_ro1, 66 &get_refcount_ro2, 67 &get_refcount_ro3, 68 &get_refcount_ro4, 69 &get_refcount_ro5, 70 &get_refcount_ro6 71 }; 72 73 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = { 74 &set_refcount_ro0, 75 &set_refcount_ro1, 76 &set_refcount_ro2, 77 &set_refcount_ro3, 78 &set_refcount_ro4, 79 &set_refcount_ro5, 80 &set_refcount_ro6 81 }; 82 83 84 /*********************************************************/ 85 /* refcount handling */ 86 87 static void update_max_refcount_table_index(BDRVQcow2State *s) 88 { 89 unsigned i = s->refcount_table_size - 1; 90 while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) { 91 i--; 92 } 93 /* Set s->max_refcount_table_index to the index of the last used entry */ 94 s->max_refcount_table_index = i; 95 } 96 97 int qcow2_refcount_init(BlockDriverState *bs) 98 { 99 BDRVQcow2State *s = bs->opaque; 100 unsigned int refcount_table_size2, i; 101 int ret; 102 103 assert(s->refcount_order >= 0 && s->refcount_order <= 6); 104 105 s->get_refcount = get_refcount_funcs[s->refcount_order]; 106 s->set_refcount = set_refcount_funcs[s->refcount_order]; 107 108 assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); 109 refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); 110 s->refcount_table = g_try_malloc(refcount_table_size2); 111 112 if (s->refcount_table_size > 0) { 113 if (s->refcount_table == NULL) { 114 ret = -ENOMEM; 115 goto fail; 116 } 117 BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); 118 ret = bdrv_pread(bs->file, s->refcount_table_offset, 119 s->refcount_table, refcount_table_size2); 120 if (ret < 0) { 121 goto fail; 122 } 123 for(i = 0; i < s->refcount_table_size; i++) 124 be64_to_cpus(&s->refcount_table[i]); 125 update_max_refcount_table_index(s); 126 } 127 return 0; 128 fail: 129 return ret; 130 } 131 132 void qcow2_refcount_close(BlockDriverState *bs) 133 { 134 BDRVQcow2State *s = bs->opaque; 135 g_free(s->refcount_table); 136 } 137 138 139 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index) 140 { 141 return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1; 142 } 143 144 static void set_refcount_ro0(void *refcount_array, uint64_t index, 145 uint64_t value) 146 { 147 assert(!(value >> 1)); 148 ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8)); 149 ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8); 150 } 151 152 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index) 153 { 154 return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4))) 155 & 0x3; 156 } 157 158 static void set_refcount_ro1(void *refcount_array, uint64_t index, 159 uint64_t value) 160 { 161 assert(!(value >> 2)); 162 ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4))); 163 ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4)); 164 } 165 166 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index) 167 { 168 return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2))) 169 & 0xf; 170 } 171 172 static void set_refcount_ro2(void *refcount_array, uint64_t index, 173 uint64_t value) 174 { 175 assert(!(value >> 4)); 176 ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2))); 177 ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2)); 178 } 179 180 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index) 181 { 182 return ((const uint8_t *)refcount_array)[index]; 183 } 184 185 static void set_refcount_ro3(void *refcount_array, uint64_t index, 186 uint64_t value) 187 { 188 assert(!(value >> 8)); 189 ((uint8_t *)refcount_array)[index] = value; 190 } 191 192 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index) 193 { 194 return be16_to_cpu(((const uint16_t *)refcount_array)[index]); 195 } 196 197 static void set_refcount_ro4(void *refcount_array, uint64_t index, 198 uint64_t value) 199 { 200 assert(!(value >> 16)); 201 ((uint16_t *)refcount_array)[index] = cpu_to_be16(value); 202 } 203 204 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index) 205 { 206 return be32_to_cpu(((const uint32_t *)refcount_array)[index]); 207 } 208 209 static void set_refcount_ro5(void *refcount_array, uint64_t index, 210 uint64_t value) 211 { 212 assert(!(value >> 32)); 213 ((uint32_t *)refcount_array)[index] = cpu_to_be32(value); 214 } 215 216 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index) 217 { 218 return be64_to_cpu(((const uint64_t *)refcount_array)[index]); 219 } 220 221 static void set_refcount_ro6(void *refcount_array, uint64_t index, 222 uint64_t value) 223 { 224 ((uint64_t *)refcount_array)[index] = cpu_to_be64(value); 225 } 226 227 228 static int load_refcount_block(BlockDriverState *bs, 229 int64_t refcount_block_offset, 230 void **refcount_block) 231 { 232 BDRVQcow2State *s = bs->opaque; 233 234 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); 235 return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, 236 refcount_block); 237 } 238 239 /* 240 * Retrieves the refcount of the cluster given by its index and stores it in 241 * *refcount. Returns 0 on success and -errno on failure. 242 */ 243 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, 244 uint64_t *refcount) 245 { 246 BDRVQcow2State *s = bs->opaque; 247 uint64_t refcount_table_index, block_index; 248 int64_t refcount_block_offset; 249 int ret; 250 void *refcount_block; 251 252 refcount_table_index = cluster_index >> s->refcount_block_bits; 253 if (refcount_table_index >= s->refcount_table_size) { 254 *refcount = 0; 255 return 0; 256 } 257 refcount_block_offset = 258 s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; 259 if (!refcount_block_offset) { 260 *refcount = 0; 261 return 0; 262 } 263 264 if (offset_into_cluster(s, refcount_block_offset)) { 265 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64 266 " unaligned (reftable index: %#" PRIx64 ")", 267 refcount_block_offset, refcount_table_index); 268 return -EIO; 269 } 270 271 ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, 272 &refcount_block); 273 if (ret < 0) { 274 return ret; 275 } 276 277 block_index = cluster_index & (s->refcount_block_size - 1); 278 *refcount = s->get_refcount(refcount_block, block_index); 279 280 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); 281 282 return 0; 283 } 284 285 /* Checks if two offsets are described by the same refcount block */ 286 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a, 287 uint64_t offset_b) 288 { 289 uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits); 290 uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits); 291 292 return (block_a == block_b); 293 } 294 295 /* 296 * Loads a refcount block. If it doesn't exist yet, it is allocated first 297 * (including growing the refcount table if needed). 298 * 299 * Returns 0 on success or -errno in error case 300 */ 301 static int alloc_refcount_block(BlockDriverState *bs, 302 int64_t cluster_index, void **refcount_block) 303 { 304 BDRVQcow2State *s = bs->opaque; 305 unsigned int refcount_table_index; 306 int64_t ret; 307 308 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); 309 310 /* Find the refcount block for the given cluster */ 311 refcount_table_index = cluster_index >> s->refcount_block_bits; 312 313 if (refcount_table_index < s->refcount_table_size) { 314 315 uint64_t refcount_block_offset = 316 s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; 317 318 /* If it's already there, we're done */ 319 if (refcount_block_offset) { 320 if (offset_into_cluster(s, refcount_block_offset)) { 321 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" 322 PRIx64 " unaligned (reftable index: " 323 "%#x)", refcount_block_offset, 324 refcount_table_index); 325 return -EIO; 326 } 327 328 return load_refcount_block(bs, refcount_block_offset, 329 refcount_block); 330 } 331 } 332 333 /* 334 * If we came here, we need to allocate something. Something is at least 335 * a cluster for the new refcount block. It may also include a new refcount 336 * table if the old refcount table is too small. 337 * 338 * Note that allocating clusters here needs some special care: 339 * 340 * - We can't use the normal qcow2_alloc_clusters(), it would try to 341 * increase the refcount and very likely we would end up with an endless 342 * recursion. Instead we must place the refcount blocks in a way that 343 * they can describe them themselves. 344 * 345 * - We need to consider that at this point we are inside update_refcounts 346 * and potentially doing an initial refcount increase. This means that 347 * some clusters have already been allocated by the caller, but their 348 * refcount isn't accurate yet. If we allocate clusters for metadata, we 349 * need to return -EAGAIN to signal the caller that it needs to restart 350 * the search for free clusters. 351 * 352 * - alloc_clusters_noref and qcow2_free_clusters may load a different 353 * refcount block into the cache 354 */ 355 356 *refcount_block = NULL; 357 358 /* We write to the refcount table, so we might depend on L2 tables */ 359 ret = qcow2_cache_flush(bs, s->l2_table_cache); 360 if (ret < 0) { 361 return ret; 362 } 363 364 /* Allocate the refcount block itself and mark it as used */ 365 int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); 366 if (new_block < 0) { 367 return new_block; 368 } 369 370 #ifdef DEBUG_ALLOC2 371 fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 372 " at %" PRIx64 "\n", 373 refcount_table_index, cluster_index << s->cluster_bits, new_block); 374 #endif 375 376 if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { 377 /* Zero the new refcount block before updating it */ 378 ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, 379 refcount_block); 380 if (ret < 0) { 381 goto fail; 382 } 383 384 memset(*refcount_block, 0, s->cluster_size); 385 386 /* The block describes itself, need to update the cache */ 387 int block_index = (new_block >> s->cluster_bits) & 388 (s->refcount_block_size - 1); 389 s->set_refcount(*refcount_block, block_index, 1); 390 } else { 391 /* Described somewhere else. This can recurse at most twice before we 392 * arrive at a block that describes itself. */ 393 ret = update_refcount(bs, new_block, s->cluster_size, 1, false, 394 QCOW2_DISCARD_NEVER); 395 if (ret < 0) { 396 goto fail; 397 } 398 399 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 400 if (ret < 0) { 401 goto fail; 402 } 403 404 /* Initialize the new refcount block only after updating its refcount, 405 * update_refcount uses the refcount cache itself */ 406 ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, 407 refcount_block); 408 if (ret < 0) { 409 goto fail; 410 } 411 412 memset(*refcount_block, 0, s->cluster_size); 413 } 414 415 /* Now the new refcount block needs to be written to disk */ 416 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); 417 qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block); 418 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 419 if (ret < 0) { 420 goto fail; 421 } 422 423 /* If the refcount table is big enough, just hook the block up there */ 424 if (refcount_table_index < s->refcount_table_size) { 425 uint64_t data64 = cpu_to_be64(new_block); 426 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); 427 ret = bdrv_pwrite_sync(bs->file, 428 s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), 429 &data64, sizeof(data64)); 430 if (ret < 0) { 431 goto fail; 432 } 433 434 s->refcount_table[refcount_table_index] = new_block; 435 /* If there's a hole in s->refcount_table then it can happen 436 * that refcount_table_index < s->max_refcount_table_index */ 437 s->max_refcount_table_index = 438 MAX(s->max_refcount_table_index, refcount_table_index); 439 440 /* The new refcount block may be where the caller intended to put its 441 * data, so let it restart the search. */ 442 return -EAGAIN; 443 } 444 445 qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); 446 447 /* 448 * If we come here, we need to grow the refcount table. Again, a new 449 * refcount table needs some space and we can't simply allocate to avoid 450 * endless recursion. 451 * 452 * Therefore let's grab new refcount blocks at the end of the image, which 453 * will describe themselves and the new refcount table. This way we can 454 * reference them only in the new table and do the switch to the new 455 * refcount table at once without producing an inconsistent state in 456 * between. 457 */ 458 BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); 459 460 /* Calculate the number of refcount blocks needed so far; this will be the 461 * basis for calculating the index of the first cluster used for the 462 * self-describing refcount structures which we are about to create. 463 * 464 * Because we reached this point, there cannot be any refcount entries for 465 * cluster_index or higher indices yet. However, because new_block has been 466 * allocated to describe that cluster (and it will assume this role later 467 * on), we cannot use that index; also, new_block may actually have a higher 468 * cluster index than cluster_index, so it needs to be taken into account 469 * here (and 1 needs to be added to its value because that cluster is used). 470 */ 471 uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1, 472 (new_block >> s->cluster_bits) + 1), 473 s->refcount_block_size); 474 475 /* Create the new refcount table and blocks */ 476 uint64_t meta_offset = (blocks_used * s->refcount_block_size) * 477 s->cluster_size; 478 479 ret = qcow2_refcount_area(bs, meta_offset, 0, false, 480 refcount_table_index, new_block); 481 if (ret < 0) { 482 return ret; 483 } 484 485 ret = load_refcount_block(bs, new_block, refcount_block); 486 if (ret < 0) { 487 return ret; 488 } 489 490 /* If we were trying to do the initial refcount update for some cluster 491 * allocation, we might have used the same clusters to store newly 492 * allocated metadata. Make the caller search some new space. */ 493 return -EAGAIN; 494 495 fail: 496 if (*refcount_block != NULL) { 497 qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); 498 } 499 return ret; 500 } 501 502 /* 503 * Starting at @start_offset, this function creates new self-covering refcount 504 * structures: A new refcount table and refcount blocks which cover all of 505 * themselves, and a number of @additional_clusters beyond their end. 506 * @start_offset must be at the end of the image file, that is, there must be 507 * only empty space beyond it. 508 * If @exact_size is false, the refcount table will have 50 % more entries than 509 * necessary so it will not need to grow again soon. 510 * If @new_refblock_offset is not zero, it contains the offset of a refcount 511 * block that should be entered into the new refcount table at index 512 * @new_refblock_index. 513 * 514 * Returns: The offset after the new refcount structures (i.e. where the 515 * @additional_clusters may be placed) on success, -errno on error. 516 */ 517 int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset, 518 uint64_t additional_clusters, bool exact_size, 519 int new_refblock_index, 520 uint64_t new_refblock_offset) 521 { 522 BDRVQcow2State *s = bs->opaque; 523 uint64_t total_refblock_count_u64, additional_refblock_count; 524 int total_refblock_count, table_size, area_reftable_index, table_clusters; 525 int i; 526 uint64_t table_offset, block_offset, end_offset; 527 int ret; 528 uint64_t *new_table; 529 530 assert(!(start_offset % s->cluster_size)); 531 532 qcow2_refcount_metadata_size(start_offset / s->cluster_size + 533 additional_clusters, 534 s->cluster_size, s->refcount_order, 535 !exact_size, &total_refblock_count_u64); 536 if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) { 537 return -EFBIG; 538 } 539 total_refblock_count = total_refblock_count_u64; 540 541 /* Index in the refcount table of the first refcount block to cover the area 542 * of refcount structures we are about to create; we know that 543 * @total_refblock_count can cover @start_offset, so this will definitely 544 * fit into an int. */ 545 area_reftable_index = (start_offset / s->cluster_size) / 546 s->refcount_block_size; 547 548 if (exact_size) { 549 table_size = total_refblock_count; 550 } else { 551 table_size = total_refblock_count + 552 DIV_ROUND_UP(total_refblock_count, 2); 553 } 554 /* The qcow2 file can only store the reftable size in number of clusters */ 555 table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t)); 556 table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size; 557 558 if (table_size > QCOW_MAX_REFTABLE_SIZE) { 559 return -EFBIG; 560 } 561 562 new_table = g_try_new0(uint64_t, table_size); 563 564 assert(table_size > 0); 565 if (new_table == NULL) { 566 ret = -ENOMEM; 567 goto fail; 568 } 569 570 /* Fill the new refcount table */ 571 if (table_size > s->max_refcount_table_index) { 572 /* We're actually growing the reftable */ 573 memcpy(new_table, s->refcount_table, 574 (s->max_refcount_table_index + 1) * sizeof(uint64_t)); 575 } else { 576 /* Improbable case: We're shrinking the reftable. However, the caller 577 * has assured us that there is only empty space beyond @start_offset, 578 * so we can simply drop all of the refblocks that won't fit into the 579 * new reftable. */ 580 memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t)); 581 } 582 583 if (new_refblock_offset) { 584 assert(new_refblock_index < total_refblock_count); 585 new_table[new_refblock_index] = new_refblock_offset; 586 } 587 588 /* Count how many new refblocks we have to create */ 589 additional_refblock_count = 0; 590 for (i = area_reftable_index; i < total_refblock_count; i++) { 591 if (!new_table[i]) { 592 additional_refblock_count++; 593 } 594 } 595 596 table_offset = start_offset + additional_refblock_count * s->cluster_size; 597 end_offset = table_offset + table_clusters * s->cluster_size; 598 599 /* Fill the refcount blocks, and create new ones, if necessary */ 600 block_offset = start_offset; 601 for (i = area_reftable_index; i < total_refblock_count; i++) { 602 void *refblock_data; 603 uint64_t first_offset_covered; 604 605 /* Reuse an existing refblock if possible, create a new one otherwise */ 606 if (new_table[i]) { 607 ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i], 608 &refblock_data); 609 if (ret < 0) { 610 goto fail; 611 } 612 } else { 613 ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, 614 block_offset, &refblock_data); 615 if (ret < 0) { 616 goto fail; 617 } 618 memset(refblock_data, 0, s->cluster_size); 619 qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, 620 refblock_data); 621 622 new_table[i] = block_offset; 623 block_offset += s->cluster_size; 624 } 625 626 /* First host offset covered by this refblock */ 627 first_offset_covered = (uint64_t)i * s->refcount_block_size * 628 s->cluster_size; 629 if (first_offset_covered < end_offset) { 630 int j, end_index; 631 632 /* Set the refcount of all of the new refcount structures to 1 */ 633 634 if (first_offset_covered < start_offset) { 635 assert(i == area_reftable_index); 636 j = (start_offset - first_offset_covered) / s->cluster_size; 637 assert(j < s->refcount_block_size); 638 } else { 639 j = 0; 640 } 641 642 end_index = MIN((end_offset - first_offset_covered) / 643 s->cluster_size, 644 s->refcount_block_size); 645 646 for (; j < end_index; j++) { 647 /* The caller guaranteed us this space would be empty */ 648 assert(s->get_refcount(refblock_data, j) == 0); 649 s->set_refcount(refblock_data, j, 1); 650 } 651 652 qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, 653 refblock_data); 654 } 655 656 qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data); 657 } 658 659 assert(block_offset == table_offset); 660 661 /* Write refcount blocks to disk */ 662 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); 663 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 664 if (ret < 0) { 665 goto fail; 666 } 667 668 /* Write refcount table to disk */ 669 for (i = 0; i < total_refblock_count; i++) { 670 cpu_to_be64s(&new_table[i]); 671 } 672 673 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); 674 ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, 675 table_size * sizeof(uint64_t)); 676 if (ret < 0) { 677 goto fail; 678 } 679 680 for (i = 0; i < total_refblock_count; i++) { 681 be64_to_cpus(&new_table[i]); 682 } 683 684 /* Hook up the new refcount table in the qcow2 header */ 685 struct QEMU_PACKED { 686 uint64_t d64; 687 uint32_t d32; 688 } data; 689 data.d64 = cpu_to_be64(table_offset); 690 data.d32 = cpu_to_be32(table_clusters); 691 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); 692 ret = bdrv_pwrite_sync(bs->file, 693 offsetof(QCowHeader, refcount_table_offset), 694 &data, sizeof(data)); 695 if (ret < 0) { 696 goto fail; 697 } 698 699 /* And switch it in memory */ 700 uint64_t old_table_offset = s->refcount_table_offset; 701 uint64_t old_table_size = s->refcount_table_size; 702 703 g_free(s->refcount_table); 704 s->refcount_table = new_table; 705 s->refcount_table_size = table_size; 706 s->refcount_table_offset = table_offset; 707 update_max_refcount_table_index(s); 708 709 /* Free old table. */ 710 qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), 711 QCOW2_DISCARD_OTHER); 712 713 return end_offset; 714 715 fail: 716 g_free(new_table); 717 return ret; 718 } 719 720 void qcow2_process_discards(BlockDriverState *bs, int ret) 721 { 722 BDRVQcow2State *s = bs->opaque; 723 Qcow2DiscardRegion *d, *next; 724 725 QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { 726 QTAILQ_REMOVE(&s->discards, d, next); 727 728 /* Discard is optional, ignore the return value */ 729 if (ret >= 0) { 730 bdrv_pdiscard(bs->file->bs, d->offset, d->bytes); 731 } 732 733 g_free(d); 734 } 735 } 736 737 static void update_refcount_discard(BlockDriverState *bs, 738 uint64_t offset, uint64_t length) 739 { 740 BDRVQcow2State *s = bs->opaque; 741 Qcow2DiscardRegion *d, *p, *next; 742 743 QTAILQ_FOREACH(d, &s->discards, next) { 744 uint64_t new_start = MIN(offset, d->offset); 745 uint64_t new_end = MAX(offset + length, d->offset + d->bytes); 746 747 if (new_end - new_start <= length + d->bytes) { 748 /* There can't be any overlap, areas ending up here have no 749 * references any more and therefore shouldn't get freed another 750 * time. */ 751 assert(d->bytes + length == new_end - new_start); 752 d->offset = new_start; 753 d->bytes = new_end - new_start; 754 goto found; 755 } 756 } 757 758 d = g_malloc(sizeof(*d)); 759 *d = (Qcow2DiscardRegion) { 760 .bs = bs, 761 .offset = offset, 762 .bytes = length, 763 }; 764 QTAILQ_INSERT_TAIL(&s->discards, d, next); 765 766 found: 767 /* Merge discard requests if they are adjacent now */ 768 QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { 769 if (p == d 770 || p->offset > d->offset + d->bytes 771 || d->offset > p->offset + p->bytes) 772 { 773 continue; 774 } 775 776 /* Still no overlap possible */ 777 assert(p->offset == d->offset + d->bytes 778 || d->offset == p->offset + p->bytes); 779 780 QTAILQ_REMOVE(&s->discards, p, next); 781 d->offset = MIN(d->offset, p->offset); 782 d->bytes += p->bytes; 783 g_free(p); 784 } 785 } 786 787 /* XXX: cache several refcount block clusters ? */ 788 /* @addend is the absolute value of the addend; if @decrease is set, @addend 789 * will be subtracted from the current refcount, otherwise it will be added */ 790 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, 791 int64_t offset, 792 int64_t length, 793 uint64_t addend, 794 bool decrease, 795 enum qcow2_discard_type type) 796 { 797 BDRVQcow2State *s = bs->opaque; 798 int64_t start, last, cluster_offset; 799 void *refcount_block = NULL; 800 int64_t old_table_index = -1; 801 int ret; 802 803 #ifdef DEBUG_ALLOC2 804 fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 805 " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "", 806 addend); 807 #endif 808 if (length < 0) { 809 return -EINVAL; 810 } else if (length == 0) { 811 return 0; 812 } 813 814 if (decrease) { 815 qcow2_cache_set_dependency(bs, s->refcount_block_cache, 816 s->l2_table_cache); 817 } 818 819 start = start_of_cluster(s, offset); 820 last = start_of_cluster(s, offset + length - 1); 821 for(cluster_offset = start; cluster_offset <= last; 822 cluster_offset += s->cluster_size) 823 { 824 int block_index; 825 uint64_t refcount; 826 int64_t cluster_index = cluster_offset >> s->cluster_bits; 827 int64_t table_index = cluster_index >> s->refcount_block_bits; 828 829 /* Load the refcount block and allocate it if needed */ 830 if (table_index != old_table_index) { 831 if (refcount_block) { 832 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); 833 } 834 ret = alloc_refcount_block(bs, cluster_index, &refcount_block); 835 if (ret < 0) { 836 goto fail; 837 } 838 } 839 old_table_index = table_index; 840 841 qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, 842 refcount_block); 843 844 /* we can update the count and save it */ 845 block_index = cluster_index & (s->refcount_block_size - 1); 846 847 refcount = s->get_refcount(refcount_block, block_index); 848 if (decrease ? (refcount - addend > refcount) 849 : (refcount + addend < refcount || 850 refcount + addend > s->refcount_max)) 851 { 852 ret = -EINVAL; 853 goto fail; 854 } 855 if (decrease) { 856 refcount -= addend; 857 } else { 858 refcount += addend; 859 } 860 if (refcount == 0 && cluster_index < s->free_cluster_index) { 861 s->free_cluster_index = cluster_index; 862 } 863 s->set_refcount(refcount_block, block_index, refcount); 864 865 if (refcount == 0) { 866 void *table; 867 868 table = qcow2_cache_is_table_offset(bs, s->refcount_block_cache, 869 offset); 870 if (table != NULL) { 871 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); 872 qcow2_cache_discard(bs, s->refcount_block_cache, table); 873 } 874 875 table = qcow2_cache_is_table_offset(bs, s->l2_table_cache, offset); 876 if (table != NULL) { 877 qcow2_cache_discard(bs, s->l2_table_cache, table); 878 } 879 880 if (s->discard_passthrough[type]) { 881 update_refcount_discard(bs, cluster_offset, s->cluster_size); 882 } 883 } 884 } 885 886 ret = 0; 887 fail: 888 if (!s->cache_discards) { 889 qcow2_process_discards(bs, ret); 890 } 891 892 /* Write last changed block to disk */ 893 if (refcount_block) { 894 qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); 895 } 896 897 /* 898 * Try do undo any updates if an error is returned (This may succeed in 899 * some cases like ENOSPC for allocating a new refcount block) 900 */ 901 if (ret < 0) { 902 int dummy; 903 dummy = update_refcount(bs, offset, cluster_offset - offset, addend, 904 !decrease, QCOW2_DISCARD_NEVER); 905 (void)dummy; 906 } 907 908 return ret; 909 } 910 911 /* 912 * Increases or decreases the refcount of a given cluster. 913 * 914 * @addend is the absolute value of the addend; if @decrease is set, @addend 915 * will be subtracted from the current refcount, otherwise it will be added. 916 * 917 * On success 0 is returned; on failure -errno is returned. 918 */ 919 int qcow2_update_cluster_refcount(BlockDriverState *bs, 920 int64_t cluster_index, 921 uint64_t addend, bool decrease, 922 enum qcow2_discard_type type) 923 { 924 BDRVQcow2State *s = bs->opaque; 925 int ret; 926 927 ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, 928 decrease, type); 929 if (ret < 0) { 930 return ret; 931 } 932 933 return 0; 934 } 935 936 937 938 /*********************************************************/ 939 /* cluster allocation functions */ 940 941 942 943 /* return < 0 if error */ 944 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) 945 { 946 BDRVQcow2State *s = bs->opaque; 947 uint64_t i, nb_clusters, refcount; 948 int ret; 949 950 /* We can't allocate clusters if they may still be queued for discard. */ 951 if (s->cache_discards) { 952 qcow2_process_discards(bs, 0); 953 } 954 955 nb_clusters = size_to_clusters(s, size); 956 retry: 957 for(i = 0; i < nb_clusters; i++) { 958 uint64_t next_cluster_index = s->free_cluster_index++; 959 ret = qcow2_get_refcount(bs, next_cluster_index, &refcount); 960 961 if (ret < 0) { 962 return ret; 963 } else if (refcount != 0) { 964 goto retry; 965 } 966 } 967 968 /* Make sure that all offsets in the "allocated" range are representable 969 * in an int64_t */ 970 if (s->free_cluster_index > 0 && 971 s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) 972 { 973 return -EFBIG; 974 } 975 976 #ifdef DEBUG_ALLOC2 977 fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", 978 size, 979 (s->free_cluster_index - nb_clusters) << s->cluster_bits); 980 #endif 981 return (s->free_cluster_index - nb_clusters) << s->cluster_bits; 982 } 983 984 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) 985 { 986 int64_t offset; 987 int ret; 988 989 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); 990 do { 991 offset = alloc_clusters_noref(bs, size); 992 if (offset < 0) { 993 return offset; 994 } 995 996 ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); 997 } while (ret == -EAGAIN); 998 999 if (ret < 0) { 1000 return ret; 1001 } 1002 1003 return offset; 1004 } 1005 1006 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, 1007 int64_t nb_clusters) 1008 { 1009 BDRVQcow2State *s = bs->opaque; 1010 uint64_t cluster_index, refcount; 1011 uint64_t i; 1012 int ret; 1013 1014 assert(nb_clusters >= 0); 1015 if (nb_clusters == 0) { 1016 return 0; 1017 } 1018 1019 do { 1020 /* Check how many clusters there are free */ 1021 cluster_index = offset >> s->cluster_bits; 1022 for(i = 0; i < nb_clusters; i++) { 1023 ret = qcow2_get_refcount(bs, cluster_index++, &refcount); 1024 if (ret < 0) { 1025 return ret; 1026 } else if (refcount != 0) { 1027 break; 1028 } 1029 } 1030 1031 /* And then allocate them */ 1032 ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false, 1033 QCOW2_DISCARD_NEVER); 1034 } while (ret == -EAGAIN); 1035 1036 if (ret < 0) { 1037 return ret; 1038 } 1039 1040 return i; 1041 } 1042 1043 /* only used to allocate compressed sectors. We try to allocate 1044 contiguous sectors. size must be <= cluster_size */ 1045 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) 1046 { 1047 BDRVQcow2State *s = bs->opaque; 1048 int64_t offset; 1049 size_t free_in_cluster; 1050 int ret; 1051 1052 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); 1053 assert(size > 0 && size <= s->cluster_size); 1054 assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset)); 1055 1056 offset = s->free_byte_offset; 1057 1058 if (offset) { 1059 uint64_t refcount; 1060 ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount); 1061 if (ret < 0) { 1062 return ret; 1063 } 1064 1065 if (refcount == s->refcount_max) { 1066 offset = 0; 1067 } 1068 } 1069 1070 free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); 1071 do { 1072 if (!offset || free_in_cluster < size) { 1073 int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); 1074 if (new_cluster < 0) { 1075 return new_cluster; 1076 } 1077 1078 if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { 1079 offset = new_cluster; 1080 free_in_cluster = s->cluster_size; 1081 } else { 1082 free_in_cluster += s->cluster_size; 1083 } 1084 } 1085 1086 assert(offset); 1087 ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); 1088 if (ret < 0) { 1089 offset = 0; 1090 } 1091 } while (ret == -EAGAIN); 1092 if (ret < 0) { 1093 return ret; 1094 } 1095 1096 /* The cluster refcount was incremented; refcount blocks must be flushed 1097 * before the caller's L2 table updates. */ 1098 qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); 1099 1100 s->free_byte_offset = offset + size; 1101 if (!offset_into_cluster(s, s->free_byte_offset)) { 1102 s->free_byte_offset = 0; 1103 } 1104 1105 return offset; 1106 } 1107 1108 void qcow2_free_clusters(BlockDriverState *bs, 1109 int64_t offset, int64_t size, 1110 enum qcow2_discard_type type) 1111 { 1112 int ret; 1113 1114 BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); 1115 ret = update_refcount(bs, offset, size, 1, true, type); 1116 if (ret < 0) { 1117 fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); 1118 /* TODO Remember the clusters to free them later and avoid leaking */ 1119 } 1120 } 1121 1122 /* 1123 * Free a cluster using its L2 entry (handles clusters of all types, e.g. 1124 * normal cluster, compressed cluster, etc.) 1125 */ 1126 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, 1127 int nb_clusters, enum qcow2_discard_type type) 1128 { 1129 BDRVQcow2State *s = bs->opaque; 1130 1131 switch (qcow2_get_cluster_type(l2_entry)) { 1132 case QCOW2_CLUSTER_COMPRESSED: 1133 { 1134 int nb_csectors; 1135 nb_csectors = ((l2_entry >> s->csize_shift) & 1136 s->csize_mask) + 1; 1137 qcow2_free_clusters(bs, 1138 (l2_entry & s->cluster_offset_mask) & ~511, 1139 nb_csectors * 512, type); 1140 } 1141 break; 1142 case QCOW2_CLUSTER_NORMAL: 1143 case QCOW2_CLUSTER_ZERO_ALLOC: 1144 if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) { 1145 qcow2_signal_corruption(bs, false, -1, -1, 1146 "Cannot free unaligned cluster %#llx", 1147 l2_entry & L2E_OFFSET_MASK); 1148 } else { 1149 qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, 1150 nb_clusters << s->cluster_bits, type); 1151 } 1152 break; 1153 case QCOW2_CLUSTER_ZERO_PLAIN: 1154 case QCOW2_CLUSTER_UNALLOCATED: 1155 break; 1156 default: 1157 abort(); 1158 } 1159 } 1160 1161 1162 1163 /*********************************************************/ 1164 /* snapshots and image creation */ 1165 1166 1167 1168 /* update the refcounts of snapshots and the copied flag */ 1169 int qcow2_update_snapshot_refcount(BlockDriverState *bs, 1170 int64_t l1_table_offset, int l1_size, int addend) 1171 { 1172 BDRVQcow2State *s = bs->opaque; 1173 uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount; 1174 bool l1_allocated = false; 1175 int64_t old_entry, old_l2_offset; 1176 int i, j, l1_modified = 0, nb_csectors; 1177 int ret; 1178 1179 assert(addend >= -1 && addend <= 1); 1180 1181 l2_table = NULL; 1182 l1_table = NULL; 1183 l1_size2 = l1_size * sizeof(uint64_t); 1184 1185 s->cache_discards = true; 1186 1187 /* WARNING: qcow2_snapshot_goto relies on this function not using the 1188 * l1_table_offset when it is the current s->l1_table_offset! Be careful 1189 * when changing this! */ 1190 if (l1_table_offset != s->l1_table_offset) { 1191 l1_table = g_try_malloc0(align_offset(l1_size2, 512)); 1192 if (l1_size2 && l1_table == NULL) { 1193 ret = -ENOMEM; 1194 goto fail; 1195 } 1196 l1_allocated = true; 1197 1198 ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); 1199 if (ret < 0) { 1200 goto fail; 1201 } 1202 1203 for (i = 0; i < l1_size; i++) { 1204 be64_to_cpus(&l1_table[i]); 1205 } 1206 } else { 1207 assert(l1_size == s->l1_size); 1208 l1_table = s->l1_table; 1209 l1_allocated = false; 1210 } 1211 1212 for (i = 0; i < l1_size; i++) { 1213 l2_offset = l1_table[i]; 1214 if (l2_offset) { 1215 old_l2_offset = l2_offset; 1216 l2_offset &= L1E_OFFSET_MASK; 1217 1218 if (offset_into_cluster(s, l2_offset)) { 1219 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" 1220 PRIx64 " unaligned (L1 index: %#x)", 1221 l2_offset, i); 1222 ret = -EIO; 1223 goto fail; 1224 } 1225 1226 ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, 1227 (void**) &l2_table); 1228 if (ret < 0) { 1229 goto fail; 1230 } 1231 1232 for (j = 0; j < s->l2_size; j++) { 1233 uint64_t cluster_index; 1234 uint64_t offset; 1235 1236 entry = be64_to_cpu(l2_table[j]); 1237 old_entry = entry; 1238 entry &= ~QCOW_OFLAG_COPIED; 1239 offset = entry & L2E_OFFSET_MASK; 1240 1241 switch (qcow2_get_cluster_type(entry)) { 1242 case QCOW2_CLUSTER_COMPRESSED: 1243 nb_csectors = ((entry >> s->csize_shift) & 1244 s->csize_mask) + 1; 1245 if (addend != 0) { 1246 ret = update_refcount(bs, 1247 (entry & s->cluster_offset_mask) & ~511, 1248 nb_csectors * 512, abs(addend), addend < 0, 1249 QCOW2_DISCARD_SNAPSHOT); 1250 if (ret < 0) { 1251 goto fail; 1252 } 1253 } 1254 /* compressed clusters are never modified */ 1255 refcount = 2; 1256 break; 1257 1258 case QCOW2_CLUSTER_NORMAL: 1259 case QCOW2_CLUSTER_ZERO_ALLOC: 1260 if (offset_into_cluster(s, offset)) { 1261 qcow2_signal_corruption(bs, true, -1, -1, "Cluster " 1262 "allocation offset %#" PRIx64 1263 " unaligned (L2 offset: %#" 1264 PRIx64 ", L2 index: %#x)", 1265 offset, l2_offset, j); 1266 ret = -EIO; 1267 goto fail; 1268 } 1269 1270 cluster_index = offset >> s->cluster_bits; 1271 assert(cluster_index); 1272 if (addend != 0) { 1273 ret = qcow2_update_cluster_refcount(bs, 1274 cluster_index, abs(addend), addend < 0, 1275 QCOW2_DISCARD_SNAPSHOT); 1276 if (ret < 0) { 1277 goto fail; 1278 } 1279 } 1280 1281 ret = qcow2_get_refcount(bs, cluster_index, &refcount); 1282 if (ret < 0) { 1283 goto fail; 1284 } 1285 break; 1286 1287 case QCOW2_CLUSTER_ZERO_PLAIN: 1288 case QCOW2_CLUSTER_UNALLOCATED: 1289 refcount = 0; 1290 break; 1291 1292 default: 1293 abort(); 1294 } 1295 1296 if (refcount == 1) { 1297 entry |= QCOW_OFLAG_COPIED; 1298 } 1299 if (entry != old_entry) { 1300 if (addend > 0) { 1301 qcow2_cache_set_dependency(bs, s->l2_table_cache, 1302 s->refcount_block_cache); 1303 } 1304 l2_table[j] = cpu_to_be64(entry); 1305 qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, 1306 l2_table); 1307 } 1308 } 1309 1310 qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); 1311 1312 if (addend != 0) { 1313 ret = qcow2_update_cluster_refcount(bs, l2_offset >> 1314 s->cluster_bits, 1315 abs(addend), addend < 0, 1316 QCOW2_DISCARD_SNAPSHOT); 1317 if (ret < 0) { 1318 goto fail; 1319 } 1320 } 1321 ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, 1322 &refcount); 1323 if (ret < 0) { 1324 goto fail; 1325 } else if (refcount == 1) { 1326 l2_offset |= QCOW_OFLAG_COPIED; 1327 } 1328 if (l2_offset != old_l2_offset) { 1329 l1_table[i] = l2_offset; 1330 l1_modified = 1; 1331 } 1332 } 1333 } 1334 1335 ret = bdrv_flush(bs); 1336 fail: 1337 if (l2_table) { 1338 qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); 1339 } 1340 1341 s->cache_discards = false; 1342 qcow2_process_discards(bs, ret); 1343 1344 /* Update L1 only if it isn't deleted anyway (addend = -1) */ 1345 if (ret == 0 && addend >= 0 && l1_modified) { 1346 for (i = 0; i < l1_size; i++) { 1347 cpu_to_be64s(&l1_table[i]); 1348 } 1349 1350 ret = bdrv_pwrite_sync(bs->file, l1_table_offset, 1351 l1_table, l1_size2); 1352 1353 for (i = 0; i < l1_size; i++) { 1354 be64_to_cpus(&l1_table[i]); 1355 } 1356 } 1357 if (l1_allocated) 1358 g_free(l1_table); 1359 return ret; 1360 } 1361 1362 1363 1364 1365 /*********************************************************/ 1366 /* refcount checking functions */ 1367 1368 1369 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries) 1370 { 1371 /* This assertion holds because there is no way we can address more than 1372 * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because 1373 * offsets have to be representable in bytes); due to every cluster 1374 * corresponding to one refcount entry, we are well below that limit */ 1375 assert(entries < (UINT64_C(1) << (64 - 9))); 1376 1377 /* Thanks to the assertion this will not overflow, because 1378 * s->refcount_order < 7. 1379 * (note: x << s->refcount_order == x * s->refcount_bits) */ 1380 return DIV_ROUND_UP(entries << s->refcount_order, 8); 1381 } 1382 1383 /** 1384 * Reallocates *array so that it can hold new_size entries. *size must contain 1385 * the current number of entries in *array. If the reallocation fails, *array 1386 * and *size will not be modified and -errno will be returned. If the 1387 * reallocation is successful, *array will be set to the new buffer, *size 1388 * will be set to new_size and 0 will be returned. The size of the reallocated 1389 * refcount array buffer will be aligned to a cluster boundary, and the newly 1390 * allocated area will be zeroed. 1391 */ 1392 static int realloc_refcount_array(BDRVQcow2State *s, void **array, 1393 int64_t *size, int64_t new_size) 1394 { 1395 int64_t old_byte_size, new_byte_size; 1396 void *new_ptr; 1397 1398 /* Round to clusters so the array can be directly written to disk */ 1399 old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size)) 1400 * s->cluster_size; 1401 new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size)) 1402 * s->cluster_size; 1403 1404 if (new_byte_size == old_byte_size) { 1405 *size = new_size; 1406 return 0; 1407 } 1408 1409 assert(new_byte_size > 0); 1410 1411 if (new_byte_size > SIZE_MAX) { 1412 return -ENOMEM; 1413 } 1414 1415 new_ptr = g_try_realloc(*array, new_byte_size); 1416 if (!new_ptr) { 1417 return -ENOMEM; 1418 } 1419 1420 if (new_byte_size > old_byte_size) { 1421 memset((char *)new_ptr + old_byte_size, 0, 1422 new_byte_size - old_byte_size); 1423 } 1424 1425 *array = new_ptr; 1426 *size = new_size; 1427 1428 return 0; 1429 } 1430 1431 /* 1432 * Increases the refcount for a range of clusters in a given refcount table. 1433 * This is used to construct a temporary refcount table out of L1 and L2 tables 1434 * which can be compared to the refcount table saved in the image. 1435 * 1436 * Modifies the number of errors in res. 1437 */ 1438 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, 1439 void **refcount_table, 1440 int64_t *refcount_table_size, 1441 int64_t offset, int64_t size) 1442 { 1443 BDRVQcow2State *s = bs->opaque; 1444 uint64_t start, last, cluster_offset, k, refcount; 1445 int ret; 1446 1447 if (size <= 0) { 1448 return 0; 1449 } 1450 1451 start = start_of_cluster(s, offset); 1452 last = start_of_cluster(s, offset + size - 1); 1453 for(cluster_offset = start; cluster_offset <= last; 1454 cluster_offset += s->cluster_size) { 1455 k = cluster_offset >> s->cluster_bits; 1456 if (k >= *refcount_table_size) { 1457 ret = realloc_refcount_array(s, refcount_table, 1458 refcount_table_size, k + 1); 1459 if (ret < 0) { 1460 res->check_errors++; 1461 return ret; 1462 } 1463 } 1464 1465 refcount = s->get_refcount(*refcount_table, k); 1466 if (refcount == s->refcount_max) { 1467 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 1468 "\n", cluster_offset); 1469 fprintf(stderr, "Use qemu-img amend to increase the refcount entry " 1470 "width or qemu-img convert to create a clean copy if the " 1471 "image cannot be opened for writing\n"); 1472 res->corruptions++; 1473 continue; 1474 } 1475 s->set_refcount(*refcount_table, k, refcount + 1); 1476 } 1477 1478 return 0; 1479 } 1480 1481 /* Flags for check_refcounts_l1() and check_refcounts_l2() */ 1482 enum { 1483 CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ 1484 }; 1485 1486 /* 1487 * Increases the refcount in the given refcount table for the all clusters 1488 * referenced in the L2 table. While doing so, performs some checks on L2 1489 * entries. 1490 * 1491 * Returns the number of errors found by the checks or -errno if an internal 1492 * error occurred. 1493 */ 1494 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, 1495 void **refcount_table, 1496 int64_t *refcount_table_size, int64_t l2_offset, 1497 int flags) 1498 { 1499 BDRVQcow2State *s = bs->opaque; 1500 uint64_t *l2_table, l2_entry; 1501 uint64_t next_contiguous_offset = 0; 1502 int i, l2_size, nb_csectors, ret; 1503 1504 /* Read L2 table from disk */ 1505 l2_size = s->l2_size * sizeof(uint64_t); 1506 l2_table = g_malloc(l2_size); 1507 1508 ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); 1509 if (ret < 0) { 1510 fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); 1511 res->check_errors++; 1512 goto fail; 1513 } 1514 1515 /* Do the actual checks */ 1516 for(i = 0; i < s->l2_size; i++) { 1517 l2_entry = be64_to_cpu(l2_table[i]); 1518 1519 switch (qcow2_get_cluster_type(l2_entry)) { 1520 case QCOW2_CLUSTER_COMPRESSED: 1521 /* Compressed clusters don't have QCOW_OFLAG_COPIED */ 1522 if (l2_entry & QCOW_OFLAG_COPIED) { 1523 fprintf(stderr, "ERROR: cluster %" PRId64 ": " 1524 "copied flag must never be set for compressed " 1525 "clusters\n", l2_entry >> s->cluster_bits); 1526 l2_entry &= ~QCOW_OFLAG_COPIED; 1527 res->corruptions++; 1528 } 1529 1530 /* Mark cluster as used */ 1531 nb_csectors = ((l2_entry >> s->csize_shift) & 1532 s->csize_mask) + 1; 1533 l2_entry &= s->cluster_offset_mask; 1534 ret = qcow2_inc_refcounts_imrt(bs, res, 1535 refcount_table, refcount_table_size, 1536 l2_entry & ~511, nb_csectors * 512); 1537 if (ret < 0) { 1538 goto fail; 1539 } 1540 1541 if (flags & CHECK_FRAG_INFO) { 1542 res->bfi.allocated_clusters++; 1543 res->bfi.compressed_clusters++; 1544 1545 /* Compressed clusters are fragmented by nature. Since they 1546 * take up sub-sector space but we only have sector granularity 1547 * I/O we need to re-read the same sectors even for adjacent 1548 * compressed clusters. 1549 */ 1550 res->bfi.fragmented_clusters++; 1551 } 1552 break; 1553 1554 case QCOW2_CLUSTER_ZERO_ALLOC: 1555 case QCOW2_CLUSTER_NORMAL: 1556 { 1557 uint64_t offset = l2_entry & L2E_OFFSET_MASK; 1558 1559 if (flags & CHECK_FRAG_INFO) { 1560 res->bfi.allocated_clusters++; 1561 if (next_contiguous_offset && 1562 offset != next_contiguous_offset) { 1563 res->bfi.fragmented_clusters++; 1564 } 1565 next_contiguous_offset = offset + s->cluster_size; 1566 } 1567 1568 /* Mark cluster as used */ 1569 ret = qcow2_inc_refcounts_imrt(bs, res, 1570 refcount_table, refcount_table_size, 1571 offset, s->cluster_size); 1572 if (ret < 0) { 1573 goto fail; 1574 } 1575 1576 /* Correct offsets are cluster aligned */ 1577 if (offset_into_cluster(s, offset)) { 1578 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " 1579 "properly aligned; L2 entry corrupted.\n", offset); 1580 res->corruptions++; 1581 } 1582 break; 1583 } 1584 1585 case QCOW2_CLUSTER_ZERO_PLAIN: 1586 case QCOW2_CLUSTER_UNALLOCATED: 1587 break; 1588 1589 default: 1590 abort(); 1591 } 1592 } 1593 1594 g_free(l2_table); 1595 return 0; 1596 1597 fail: 1598 g_free(l2_table); 1599 return ret; 1600 } 1601 1602 /* 1603 * Increases the refcount for the L1 table, its L2 tables and all referenced 1604 * clusters in the given refcount table. While doing so, performs some checks 1605 * on L1 and L2 entries. 1606 * 1607 * Returns the number of errors found by the checks or -errno if an internal 1608 * error occurred. 1609 */ 1610 static int check_refcounts_l1(BlockDriverState *bs, 1611 BdrvCheckResult *res, 1612 void **refcount_table, 1613 int64_t *refcount_table_size, 1614 int64_t l1_table_offset, int l1_size, 1615 int flags) 1616 { 1617 BDRVQcow2State *s = bs->opaque; 1618 uint64_t *l1_table = NULL, l2_offset, l1_size2; 1619 int i, ret; 1620 1621 l1_size2 = l1_size * sizeof(uint64_t); 1622 1623 /* Mark L1 table as used */ 1624 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size, 1625 l1_table_offset, l1_size2); 1626 if (ret < 0) { 1627 goto fail; 1628 } 1629 1630 /* Read L1 table entries from disk */ 1631 if (l1_size2 > 0) { 1632 l1_table = g_try_malloc(l1_size2); 1633 if (l1_table == NULL) { 1634 ret = -ENOMEM; 1635 res->check_errors++; 1636 goto fail; 1637 } 1638 ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); 1639 if (ret < 0) { 1640 fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); 1641 res->check_errors++; 1642 goto fail; 1643 } 1644 for(i = 0;i < l1_size; i++) 1645 be64_to_cpus(&l1_table[i]); 1646 } 1647 1648 /* Do the actual checks */ 1649 for(i = 0; i < l1_size; i++) { 1650 l2_offset = l1_table[i]; 1651 if (l2_offset) { 1652 /* Mark L2 table as used */ 1653 l2_offset &= L1E_OFFSET_MASK; 1654 ret = qcow2_inc_refcounts_imrt(bs, res, 1655 refcount_table, refcount_table_size, 1656 l2_offset, s->cluster_size); 1657 if (ret < 0) { 1658 goto fail; 1659 } 1660 1661 /* L2 tables are cluster aligned */ 1662 if (offset_into_cluster(s, l2_offset)) { 1663 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " 1664 "cluster aligned; L1 entry corrupted\n", l2_offset); 1665 res->corruptions++; 1666 } 1667 1668 /* Process and check L2 entries */ 1669 ret = check_refcounts_l2(bs, res, refcount_table, 1670 refcount_table_size, l2_offset, flags); 1671 if (ret < 0) { 1672 goto fail; 1673 } 1674 } 1675 } 1676 g_free(l1_table); 1677 return 0; 1678 1679 fail: 1680 g_free(l1_table); 1681 return ret; 1682 } 1683 1684 /* 1685 * Checks the OFLAG_COPIED flag for all L1 and L2 entries. 1686 * 1687 * This function does not print an error message nor does it increment 1688 * check_errors if qcow2_get_refcount fails (this is because such an error will 1689 * have been already detected and sufficiently signaled by the calling function 1690 * (qcow2_check_refcounts) by the time this function is called). 1691 */ 1692 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, 1693 BdrvCheckMode fix) 1694 { 1695 BDRVQcow2State *s = bs->opaque; 1696 uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); 1697 int ret; 1698 uint64_t refcount; 1699 int i, j; 1700 1701 for (i = 0; i < s->l1_size; i++) { 1702 uint64_t l1_entry = s->l1_table[i]; 1703 uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK; 1704 bool l2_dirty = false; 1705 1706 if (!l2_offset) { 1707 continue; 1708 } 1709 1710 ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, 1711 &refcount); 1712 if (ret < 0) { 1713 /* don't print message nor increment check_errors */ 1714 continue; 1715 } 1716 if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { 1717 fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " 1718 "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n", 1719 fix & BDRV_FIX_ERRORS ? "Repairing" : 1720 "ERROR", 1721 i, l1_entry, refcount); 1722 if (fix & BDRV_FIX_ERRORS) { 1723 s->l1_table[i] = refcount == 1 1724 ? l1_entry | QCOW_OFLAG_COPIED 1725 : l1_entry & ~QCOW_OFLAG_COPIED; 1726 ret = qcow2_write_l1_entry(bs, i); 1727 if (ret < 0) { 1728 res->check_errors++; 1729 goto fail; 1730 } 1731 res->corruptions_fixed++; 1732 } else { 1733 res->corruptions++; 1734 } 1735 } 1736 1737 ret = bdrv_pread(bs->file, l2_offset, l2_table, 1738 s->l2_size * sizeof(uint64_t)); 1739 if (ret < 0) { 1740 fprintf(stderr, "ERROR: Could not read L2 table: %s\n", 1741 strerror(-ret)); 1742 res->check_errors++; 1743 goto fail; 1744 } 1745 1746 for (j = 0; j < s->l2_size; j++) { 1747 uint64_t l2_entry = be64_to_cpu(l2_table[j]); 1748 uint64_t data_offset = l2_entry & L2E_OFFSET_MASK; 1749 QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry); 1750 1751 if (cluster_type == QCOW2_CLUSTER_NORMAL || 1752 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) { 1753 ret = qcow2_get_refcount(bs, 1754 data_offset >> s->cluster_bits, 1755 &refcount); 1756 if (ret < 0) { 1757 /* don't print message nor increment check_errors */ 1758 continue; 1759 } 1760 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { 1761 fprintf(stderr, "%s OFLAG_COPIED data cluster: " 1762 "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n", 1763 fix & BDRV_FIX_ERRORS ? "Repairing" : 1764 "ERROR", 1765 l2_entry, refcount); 1766 if (fix & BDRV_FIX_ERRORS) { 1767 l2_table[j] = cpu_to_be64(refcount == 1 1768 ? l2_entry | QCOW_OFLAG_COPIED 1769 : l2_entry & ~QCOW_OFLAG_COPIED); 1770 l2_dirty = true; 1771 res->corruptions_fixed++; 1772 } else { 1773 res->corruptions++; 1774 } 1775 } 1776 } 1777 } 1778 1779 if (l2_dirty) { 1780 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, 1781 l2_offset, s->cluster_size); 1782 if (ret < 0) { 1783 fprintf(stderr, "ERROR: Could not write L2 table; metadata " 1784 "overlap check failed: %s\n", strerror(-ret)); 1785 res->check_errors++; 1786 goto fail; 1787 } 1788 1789 ret = bdrv_pwrite(bs->file, l2_offset, l2_table, 1790 s->cluster_size); 1791 if (ret < 0) { 1792 fprintf(stderr, "ERROR: Could not write L2 table: %s\n", 1793 strerror(-ret)); 1794 res->check_errors++; 1795 goto fail; 1796 } 1797 } 1798 } 1799 1800 ret = 0; 1801 1802 fail: 1803 qemu_vfree(l2_table); 1804 return ret; 1805 } 1806 1807 /* 1808 * Checks consistency of refblocks and accounts for each refblock in 1809 * *refcount_table. 1810 */ 1811 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, 1812 BdrvCheckMode fix, bool *rebuild, 1813 void **refcount_table, int64_t *nb_clusters) 1814 { 1815 BDRVQcow2State *s = bs->opaque; 1816 int64_t i, size; 1817 int ret; 1818 1819 for(i = 0; i < s->refcount_table_size; i++) { 1820 uint64_t offset, cluster; 1821 offset = s->refcount_table[i]; 1822 cluster = offset >> s->cluster_bits; 1823 1824 /* Refcount blocks are cluster aligned */ 1825 if (offset_into_cluster(s, offset)) { 1826 fprintf(stderr, "ERROR refcount block %" PRId64 " is not " 1827 "cluster aligned; refcount table entry corrupted\n", i); 1828 res->corruptions++; 1829 *rebuild = true; 1830 continue; 1831 } 1832 1833 if (cluster >= *nb_clusters) { 1834 fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n", 1835 fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); 1836 1837 if (fix & BDRV_FIX_ERRORS) { 1838 int64_t new_nb_clusters; 1839 Error *local_err = NULL; 1840 1841 if (offset > INT64_MAX - s->cluster_size) { 1842 ret = -EINVAL; 1843 goto resize_fail; 1844 } 1845 1846 ret = bdrv_truncate(bs->file, offset + s->cluster_size, 1847 PREALLOC_MODE_OFF, &local_err); 1848 if (ret < 0) { 1849 error_report_err(local_err); 1850 goto resize_fail; 1851 } 1852 size = bdrv_getlength(bs->file->bs); 1853 if (size < 0) { 1854 ret = size; 1855 goto resize_fail; 1856 } 1857 1858 new_nb_clusters = size_to_clusters(s, size); 1859 assert(new_nb_clusters >= *nb_clusters); 1860 1861 ret = realloc_refcount_array(s, refcount_table, 1862 nb_clusters, new_nb_clusters); 1863 if (ret < 0) { 1864 res->check_errors++; 1865 return ret; 1866 } 1867 1868 if (cluster >= *nb_clusters) { 1869 ret = -EINVAL; 1870 goto resize_fail; 1871 } 1872 1873 res->corruptions_fixed++; 1874 ret = qcow2_inc_refcounts_imrt(bs, res, 1875 refcount_table, nb_clusters, 1876 offset, s->cluster_size); 1877 if (ret < 0) { 1878 return ret; 1879 } 1880 /* No need to check whether the refcount is now greater than 1: 1881 * This area was just allocated and zeroed, so it can only be 1882 * exactly 1 after qcow2_inc_refcounts_imrt() */ 1883 continue; 1884 1885 resize_fail: 1886 res->corruptions++; 1887 *rebuild = true; 1888 fprintf(stderr, "ERROR could not resize image: %s\n", 1889 strerror(-ret)); 1890 } else { 1891 res->corruptions++; 1892 } 1893 continue; 1894 } 1895 1896 if (offset != 0) { 1897 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1898 offset, s->cluster_size); 1899 if (ret < 0) { 1900 return ret; 1901 } 1902 if (s->get_refcount(*refcount_table, cluster) != 1) { 1903 fprintf(stderr, "ERROR refcount block %" PRId64 1904 " refcount=%" PRIu64 "\n", i, 1905 s->get_refcount(*refcount_table, cluster)); 1906 res->corruptions++; 1907 *rebuild = true; 1908 } 1909 } 1910 } 1911 1912 return 0; 1913 } 1914 1915 /* 1916 * Calculates an in-memory refcount table. 1917 */ 1918 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 1919 BdrvCheckMode fix, bool *rebuild, 1920 void **refcount_table, int64_t *nb_clusters) 1921 { 1922 BDRVQcow2State *s = bs->opaque; 1923 int64_t i; 1924 QCowSnapshot *sn; 1925 int ret; 1926 1927 if (!*refcount_table) { 1928 int64_t old_size = 0; 1929 ret = realloc_refcount_array(s, refcount_table, 1930 &old_size, *nb_clusters); 1931 if (ret < 0) { 1932 res->check_errors++; 1933 return ret; 1934 } 1935 } 1936 1937 /* header */ 1938 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1939 0, s->cluster_size); 1940 if (ret < 0) { 1941 return ret; 1942 } 1943 1944 /* current L1 table */ 1945 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, 1946 s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO); 1947 if (ret < 0) { 1948 return ret; 1949 } 1950 1951 /* snapshots */ 1952 for (i = 0; i < s->nb_snapshots; i++) { 1953 sn = s->snapshots + i; 1954 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, 1955 sn->l1_table_offset, sn->l1_size, 0); 1956 if (ret < 0) { 1957 return ret; 1958 } 1959 } 1960 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1961 s->snapshots_offset, s->snapshots_size); 1962 if (ret < 0) { 1963 return ret; 1964 } 1965 1966 /* refcount data */ 1967 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1968 s->refcount_table_offset, 1969 s->refcount_table_size * sizeof(uint64_t)); 1970 if (ret < 0) { 1971 return ret; 1972 } 1973 1974 /* encryption */ 1975 if (s->crypto_header.length) { 1976 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1977 s->crypto_header.offset, 1978 s->crypto_header.length); 1979 if (ret < 0) { 1980 return ret; 1981 } 1982 } 1983 1984 /* bitmaps */ 1985 ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters); 1986 if (ret < 0) { 1987 return ret; 1988 } 1989 1990 return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters); 1991 } 1992 1993 /* 1994 * Compares the actual reference count for each cluster in the image against the 1995 * refcount as reported by the refcount structures on-disk. 1996 */ 1997 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 1998 BdrvCheckMode fix, bool *rebuild, 1999 int64_t *highest_cluster, 2000 void *refcount_table, int64_t nb_clusters) 2001 { 2002 BDRVQcow2State *s = bs->opaque; 2003 int64_t i; 2004 uint64_t refcount1, refcount2; 2005 int ret; 2006 2007 for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { 2008 ret = qcow2_get_refcount(bs, i, &refcount1); 2009 if (ret < 0) { 2010 fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", 2011 i, strerror(-ret)); 2012 res->check_errors++; 2013 continue; 2014 } 2015 2016 refcount2 = s->get_refcount(refcount_table, i); 2017 2018 if (refcount1 > 0 || refcount2 > 0) { 2019 *highest_cluster = i; 2020 } 2021 2022 if (refcount1 != refcount2) { 2023 /* Check if we're allowed to fix the mismatch */ 2024 int *num_fixed = NULL; 2025 if (refcount1 == 0) { 2026 *rebuild = true; 2027 } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { 2028 num_fixed = &res->leaks_fixed; 2029 } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { 2030 num_fixed = &res->corruptions_fixed; 2031 } 2032 2033 fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64 2034 " reference=%" PRIu64 "\n", 2035 num_fixed != NULL ? "Repairing" : 2036 refcount1 < refcount2 ? "ERROR" : 2037 "Leaked", 2038 i, refcount1, refcount2); 2039 2040 if (num_fixed) { 2041 ret = update_refcount(bs, i << s->cluster_bits, 1, 2042 refcount_diff(refcount1, refcount2), 2043 refcount1 > refcount2, 2044 QCOW2_DISCARD_ALWAYS); 2045 if (ret >= 0) { 2046 (*num_fixed)++; 2047 continue; 2048 } 2049 } 2050 2051 /* And if we couldn't, print an error */ 2052 if (refcount1 < refcount2) { 2053 res->corruptions++; 2054 } else { 2055 res->leaks++; 2056 } 2057 } 2058 } 2059 } 2060 2061 /* 2062 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to 2063 * the on-disk refcount structures. 2064 * 2065 * On input, *first_free_cluster tells where to start looking, and need not 2066 * actually be a free cluster; the returned offset will not be before that 2067 * cluster. On output, *first_free_cluster points to the first gap found, even 2068 * if that gap was too small to be used as the returned offset. 2069 * 2070 * Note that *first_free_cluster is a cluster index whereas the return value is 2071 * an offset. 2072 */ 2073 static int64_t alloc_clusters_imrt(BlockDriverState *bs, 2074 int cluster_count, 2075 void **refcount_table, 2076 int64_t *imrt_nb_clusters, 2077 int64_t *first_free_cluster) 2078 { 2079 BDRVQcow2State *s = bs->opaque; 2080 int64_t cluster = *first_free_cluster, i; 2081 bool first_gap = true; 2082 int contiguous_free_clusters; 2083 int ret; 2084 2085 /* Starting at *first_free_cluster, find a range of at least cluster_count 2086 * continuously free clusters */ 2087 for (contiguous_free_clusters = 0; 2088 cluster < *imrt_nb_clusters && 2089 contiguous_free_clusters < cluster_count; 2090 cluster++) 2091 { 2092 if (!s->get_refcount(*refcount_table, cluster)) { 2093 contiguous_free_clusters++; 2094 if (first_gap) { 2095 /* If this is the first free cluster found, update 2096 * *first_free_cluster accordingly */ 2097 *first_free_cluster = cluster; 2098 first_gap = false; 2099 } 2100 } else if (contiguous_free_clusters) { 2101 contiguous_free_clusters = 0; 2102 } 2103 } 2104 2105 /* If contiguous_free_clusters is greater than zero, it contains the number 2106 * of continuously free clusters until the current cluster; the first free 2107 * cluster in the current "gap" is therefore 2108 * cluster - contiguous_free_clusters */ 2109 2110 /* If no such range could be found, grow the in-memory refcount table 2111 * accordingly to append free clusters at the end of the image */ 2112 if (contiguous_free_clusters < cluster_count) { 2113 /* contiguous_free_clusters clusters are already empty at the image end; 2114 * we need cluster_count clusters; therefore, we have to allocate 2115 * cluster_count - contiguous_free_clusters new clusters at the end of 2116 * the image (which is the current value of cluster; note that cluster 2117 * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond 2118 * the image end) */ 2119 ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters, 2120 cluster + cluster_count 2121 - contiguous_free_clusters); 2122 if (ret < 0) { 2123 return ret; 2124 } 2125 } 2126 2127 /* Go back to the first free cluster */ 2128 cluster -= contiguous_free_clusters; 2129 for (i = 0; i < cluster_count; i++) { 2130 s->set_refcount(*refcount_table, cluster + i, 1); 2131 } 2132 2133 return cluster << s->cluster_bits; 2134 } 2135 2136 /* 2137 * Creates a new refcount structure based solely on the in-memory information 2138 * given through *refcount_table. All necessary allocations will be reflected 2139 * in that array. 2140 * 2141 * On success, the old refcount structure is leaked (it will be covered by the 2142 * new refcount structure). 2143 */ 2144 static int rebuild_refcount_structure(BlockDriverState *bs, 2145 BdrvCheckResult *res, 2146 void **refcount_table, 2147 int64_t *nb_clusters) 2148 { 2149 BDRVQcow2State *s = bs->opaque; 2150 int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; 2151 int64_t refblock_offset, refblock_start, refblock_index; 2152 uint32_t reftable_size = 0; 2153 uint64_t *on_disk_reftable = NULL; 2154 void *on_disk_refblock; 2155 int ret = 0; 2156 struct { 2157 uint64_t reftable_offset; 2158 uint32_t reftable_clusters; 2159 } QEMU_PACKED reftable_offset_and_clusters; 2160 2161 qcow2_cache_empty(bs, s->refcount_block_cache); 2162 2163 write_refblocks: 2164 for (; cluster < *nb_clusters; cluster++) { 2165 if (!s->get_refcount(*refcount_table, cluster)) { 2166 continue; 2167 } 2168 2169 refblock_index = cluster >> s->refcount_block_bits; 2170 refblock_start = refblock_index << s->refcount_block_bits; 2171 2172 /* Don't allocate a cluster in a refblock already written to disk */ 2173 if (first_free_cluster < refblock_start) { 2174 first_free_cluster = refblock_start; 2175 } 2176 refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, 2177 nb_clusters, &first_free_cluster); 2178 if (refblock_offset < 0) { 2179 fprintf(stderr, "ERROR allocating refblock: %s\n", 2180 strerror(-refblock_offset)); 2181 res->check_errors++; 2182 ret = refblock_offset; 2183 goto fail; 2184 } 2185 2186 if (reftable_size <= refblock_index) { 2187 uint32_t old_reftable_size = reftable_size; 2188 uint64_t *new_on_disk_reftable; 2189 2190 reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t), 2191 s->cluster_size) / sizeof(uint64_t); 2192 new_on_disk_reftable = g_try_realloc(on_disk_reftable, 2193 reftable_size * 2194 sizeof(uint64_t)); 2195 if (!new_on_disk_reftable) { 2196 res->check_errors++; 2197 ret = -ENOMEM; 2198 goto fail; 2199 } 2200 on_disk_reftable = new_on_disk_reftable; 2201 2202 memset(on_disk_reftable + old_reftable_size, 0, 2203 (reftable_size - old_reftable_size) * sizeof(uint64_t)); 2204 2205 /* The offset we have for the reftable is now no longer valid; 2206 * this will leak that range, but we can easily fix that by running 2207 * a leak-fixing check after this rebuild operation */ 2208 reftable_offset = -1; 2209 } else { 2210 assert(on_disk_reftable); 2211 } 2212 on_disk_reftable[refblock_index] = refblock_offset; 2213 2214 /* If this is apparently the last refblock (for now), try to squeeze the 2215 * reftable in */ 2216 if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits && 2217 reftable_offset < 0) 2218 { 2219 uint64_t reftable_clusters = size_to_clusters(s, reftable_size * 2220 sizeof(uint64_t)); 2221 reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, 2222 refcount_table, nb_clusters, 2223 &first_free_cluster); 2224 if (reftable_offset < 0) { 2225 fprintf(stderr, "ERROR allocating reftable: %s\n", 2226 strerror(-reftable_offset)); 2227 res->check_errors++; 2228 ret = reftable_offset; 2229 goto fail; 2230 } 2231 } 2232 2233 ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, 2234 s->cluster_size); 2235 if (ret < 0) { 2236 fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); 2237 goto fail; 2238 } 2239 2240 /* The size of *refcount_table is always cluster-aligned, therefore the 2241 * write operation will not overflow */ 2242 on_disk_refblock = (void *)((char *) *refcount_table + 2243 refblock_index * s->cluster_size); 2244 2245 ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE, 2246 on_disk_refblock, s->cluster_sectors); 2247 if (ret < 0) { 2248 fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); 2249 goto fail; 2250 } 2251 2252 /* Go to the end of this refblock */ 2253 cluster = refblock_start + s->refcount_block_size - 1; 2254 } 2255 2256 if (reftable_offset < 0) { 2257 uint64_t post_refblock_start, reftable_clusters; 2258 2259 post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size); 2260 reftable_clusters = size_to_clusters(s, 2261 reftable_size * sizeof(uint64_t)); 2262 /* Not pretty but simple */ 2263 if (first_free_cluster < post_refblock_start) { 2264 first_free_cluster = post_refblock_start; 2265 } 2266 reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, 2267 refcount_table, nb_clusters, 2268 &first_free_cluster); 2269 if (reftable_offset < 0) { 2270 fprintf(stderr, "ERROR allocating reftable: %s\n", 2271 strerror(-reftable_offset)); 2272 res->check_errors++; 2273 ret = reftable_offset; 2274 goto fail; 2275 } 2276 2277 goto write_refblocks; 2278 } 2279 2280 for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { 2281 cpu_to_be64s(&on_disk_reftable[refblock_index]); 2282 } 2283 2284 ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, 2285 reftable_size * sizeof(uint64_t)); 2286 if (ret < 0) { 2287 fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); 2288 goto fail; 2289 } 2290 2291 assert(reftable_size < INT_MAX / sizeof(uint64_t)); 2292 ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, 2293 reftable_size * sizeof(uint64_t)); 2294 if (ret < 0) { 2295 fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); 2296 goto fail; 2297 } 2298 2299 /* Enter new reftable into the image header */ 2300 reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); 2301 reftable_offset_and_clusters.reftable_clusters = 2302 cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t))); 2303 ret = bdrv_pwrite_sync(bs->file, 2304 offsetof(QCowHeader, refcount_table_offset), 2305 &reftable_offset_and_clusters, 2306 sizeof(reftable_offset_and_clusters)); 2307 if (ret < 0) { 2308 fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret)); 2309 goto fail; 2310 } 2311 2312 for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { 2313 be64_to_cpus(&on_disk_reftable[refblock_index]); 2314 } 2315 s->refcount_table = on_disk_reftable; 2316 s->refcount_table_offset = reftable_offset; 2317 s->refcount_table_size = reftable_size; 2318 update_max_refcount_table_index(s); 2319 2320 return 0; 2321 2322 fail: 2323 g_free(on_disk_reftable); 2324 return ret; 2325 } 2326 2327 /* 2328 * Checks an image for refcount consistency. 2329 * 2330 * Returns 0 if no errors are found, the number of errors in case the image is 2331 * detected as corrupted, and -errno when an internal error occurred. 2332 */ 2333 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 2334 BdrvCheckMode fix) 2335 { 2336 BDRVQcow2State *s = bs->opaque; 2337 BdrvCheckResult pre_compare_res; 2338 int64_t size, highest_cluster, nb_clusters; 2339 void *refcount_table = NULL; 2340 bool rebuild = false; 2341 int ret; 2342 2343 size = bdrv_getlength(bs->file->bs); 2344 if (size < 0) { 2345 res->check_errors++; 2346 return size; 2347 } 2348 2349 nb_clusters = size_to_clusters(s, size); 2350 if (nb_clusters > INT_MAX) { 2351 res->check_errors++; 2352 return -EFBIG; 2353 } 2354 2355 res->bfi.total_clusters = 2356 size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); 2357 2358 ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table, 2359 &nb_clusters); 2360 if (ret < 0) { 2361 goto fail; 2362 } 2363 2364 /* In case we don't need to rebuild the refcount structure (but want to fix 2365 * something), this function is immediately called again, in which case the 2366 * result should be ignored */ 2367 pre_compare_res = *res; 2368 compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table, 2369 nb_clusters); 2370 2371 if (rebuild && (fix & BDRV_FIX_ERRORS)) { 2372 BdrvCheckResult old_res = *res; 2373 int fresh_leaks = 0; 2374 2375 fprintf(stderr, "Rebuilding refcount structure\n"); 2376 ret = rebuild_refcount_structure(bs, res, &refcount_table, 2377 &nb_clusters); 2378 if (ret < 0) { 2379 goto fail; 2380 } 2381 2382 res->corruptions = 0; 2383 res->leaks = 0; 2384 2385 /* Because the old reftable has been exchanged for a new one the 2386 * references have to be recalculated */ 2387 rebuild = false; 2388 memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters)); 2389 ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table, 2390 &nb_clusters); 2391 if (ret < 0) { 2392 goto fail; 2393 } 2394 2395 if (fix & BDRV_FIX_LEAKS) { 2396 /* The old refcount structures are now leaked, fix it; the result 2397 * can be ignored, aside from leaks which were introduced by 2398 * rebuild_refcount_structure() that could not be fixed */ 2399 BdrvCheckResult saved_res = *res; 2400 *res = (BdrvCheckResult){ 0 }; 2401 2402 compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild, 2403 &highest_cluster, refcount_table, nb_clusters); 2404 if (rebuild) { 2405 fprintf(stderr, "ERROR rebuilt refcount structure is still " 2406 "broken\n"); 2407 } 2408 2409 /* Any leaks accounted for here were introduced by 2410 * rebuild_refcount_structure() because that function has created a 2411 * new refcount structure from scratch */ 2412 fresh_leaks = res->leaks; 2413 *res = saved_res; 2414 } 2415 2416 if (res->corruptions < old_res.corruptions) { 2417 res->corruptions_fixed += old_res.corruptions - res->corruptions; 2418 } 2419 if (res->leaks < old_res.leaks) { 2420 res->leaks_fixed += old_res.leaks - res->leaks; 2421 } 2422 res->leaks += fresh_leaks; 2423 } else if (fix) { 2424 if (rebuild) { 2425 fprintf(stderr, "ERROR need to rebuild refcount structures\n"); 2426 res->check_errors++; 2427 ret = -EIO; 2428 goto fail; 2429 } 2430 2431 if (res->leaks || res->corruptions) { 2432 *res = pre_compare_res; 2433 compare_refcounts(bs, res, fix, &rebuild, &highest_cluster, 2434 refcount_table, nb_clusters); 2435 } 2436 } 2437 2438 /* check OFLAG_COPIED */ 2439 ret = check_oflag_copied(bs, res, fix); 2440 if (ret < 0) { 2441 goto fail; 2442 } 2443 2444 res->image_end_offset = (highest_cluster + 1) * s->cluster_size; 2445 ret = 0; 2446 2447 fail: 2448 g_free(refcount_table); 2449 2450 return ret; 2451 } 2452 2453 #define overlaps_with(ofs, sz) \ 2454 ranges_overlap(offset, size, ofs, sz) 2455 2456 /* 2457 * Checks if the given offset into the image file is actually free to use by 2458 * looking for overlaps with important metadata sections (L1/L2 tables etc.), 2459 * i.e. a sanity check without relying on the refcount tables. 2460 * 2461 * The ign parameter specifies what checks not to perform (being a bitmask of 2462 * QCow2MetadataOverlap values), i.e., what sections to ignore. 2463 * 2464 * Returns: 2465 * - 0 if writing to this offset will not affect the mentioned metadata 2466 * - a positive QCow2MetadataOverlap value indicating one overlapping section 2467 * - a negative value (-errno) indicating an error while performing a check, 2468 * e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2 2469 */ 2470 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, 2471 int64_t size) 2472 { 2473 BDRVQcow2State *s = bs->opaque; 2474 int chk = s->overlap_check & ~ign; 2475 int i, j; 2476 2477 if (!size) { 2478 return 0; 2479 } 2480 2481 if (chk & QCOW2_OL_MAIN_HEADER) { 2482 if (offset < s->cluster_size) { 2483 return QCOW2_OL_MAIN_HEADER; 2484 } 2485 } 2486 2487 /* align range to test to cluster boundaries */ 2488 size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); 2489 offset = start_of_cluster(s, offset); 2490 2491 if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { 2492 if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { 2493 return QCOW2_OL_ACTIVE_L1; 2494 } 2495 } 2496 2497 if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { 2498 if (overlaps_with(s->refcount_table_offset, 2499 s->refcount_table_size * sizeof(uint64_t))) { 2500 return QCOW2_OL_REFCOUNT_TABLE; 2501 } 2502 } 2503 2504 if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { 2505 if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { 2506 return QCOW2_OL_SNAPSHOT_TABLE; 2507 } 2508 } 2509 2510 if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { 2511 for (i = 0; i < s->nb_snapshots; i++) { 2512 if (s->snapshots[i].l1_size && 2513 overlaps_with(s->snapshots[i].l1_table_offset, 2514 s->snapshots[i].l1_size * sizeof(uint64_t))) { 2515 return QCOW2_OL_INACTIVE_L1; 2516 } 2517 } 2518 } 2519 2520 if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { 2521 for (i = 0; i < s->l1_size; i++) { 2522 if ((s->l1_table[i] & L1E_OFFSET_MASK) && 2523 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, 2524 s->cluster_size)) { 2525 return QCOW2_OL_ACTIVE_L2; 2526 } 2527 } 2528 } 2529 2530 if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { 2531 unsigned last_entry = s->max_refcount_table_index; 2532 assert(last_entry < s->refcount_table_size); 2533 assert(last_entry + 1 == s->refcount_table_size || 2534 (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0); 2535 for (i = 0; i <= last_entry; i++) { 2536 if ((s->refcount_table[i] & REFT_OFFSET_MASK) && 2537 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, 2538 s->cluster_size)) { 2539 return QCOW2_OL_REFCOUNT_BLOCK; 2540 } 2541 } 2542 } 2543 2544 if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { 2545 for (i = 0; i < s->nb_snapshots; i++) { 2546 uint64_t l1_ofs = s->snapshots[i].l1_table_offset; 2547 uint32_t l1_sz = s->snapshots[i].l1_size; 2548 uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); 2549 uint64_t *l1 = g_try_malloc(l1_sz2); 2550 int ret; 2551 2552 if (l1_sz2 && l1 == NULL) { 2553 return -ENOMEM; 2554 } 2555 2556 ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); 2557 if (ret < 0) { 2558 g_free(l1); 2559 return ret; 2560 } 2561 2562 for (j = 0; j < l1_sz; j++) { 2563 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; 2564 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { 2565 g_free(l1); 2566 return QCOW2_OL_INACTIVE_L2; 2567 } 2568 } 2569 2570 g_free(l1); 2571 } 2572 } 2573 2574 return 0; 2575 } 2576 2577 static const char *metadata_ol_names[] = { 2578 [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", 2579 [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", 2580 [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", 2581 [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", 2582 [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", 2583 [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", 2584 [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", 2585 [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", 2586 }; 2587 2588 /* 2589 * First performs a check for metadata overlaps (through 2590 * qcow2_check_metadata_overlap); if that fails with a negative value (error 2591 * while performing a check), that value is returned. If an impending overlap 2592 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt 2593 * and -EIO returned. 2594 * 2595 * Returns 0 if there were neither overlaps nor errors while checking for 2596 * overlaps; or a negative value (-errno) on error. 2597 */ 2598 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, 2599 int64_t size) 2600 { 2601 int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); 2602 2603 if (ret < 0) { 2604 return ret; 2605 } else if (ret > 0) { 2606 int metadata_ol_bitnr = ctz32(ret); 2607 assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); 2608 2609 qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " 2610 "write on metadata (overlaps with %s)", 2611 metadata_ol_names[metadata_ol_bitnr]); 2612 return -EIO; 2613 } 2614 2615 return 0; 2616 } 2617 2618 /* A pointer to a function of this type is given to walk_over_reftable(). That 2619 * function will create refblocks and pass them to a RefblockFinishOp once they 2620 * are completed (@refblock). @refblock_empty is set if the refblock is 2621 * completely empty. 2622 * 2623 * Along with the refblock, a corresponding reftable entry is passed, in the 2624 * reftable @reftable (which may be reallocated) at @reftable_index. 2625 * 2626 * @allocated should be set to true if a new cluster has been allocated. 2627 */ 2628 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable, 2629 uint64_t reftable_index, uint64_t *reftable_size, 2630 void *refblock, bool refblock_empty, 2631 bool *allocated, Error **errp); 2632 2633 /** 2634 * This "operation" for walk_over_reftable() allocates the refblock on disk (if 2635 * it is not empty) and inserts its offset into the new reftable. The size of 2636 * this new reftable is increased as required. 2637 */ 2638 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable, 2639 uint64_t reftable_index, uint64_t *reftable_size, 2640 void *refblock, bool refblock_empty, bool *allocated, 2641 Error **errp) 2642 { 2643 BDRVQcow2State *s = bs->opaque; 2644 int64_t offset; 2645 2646 if (!refblock_empty && reftable_index >= *reftable_size) { 2647 uint64_t *new_reftable; 2648 uint64_t new_reftable_size; 2649 2650 new_reftable_size = ROUND_UP(reftable_index + 1, 2651 s->cluster_size / sizeof(uint64_t)); 2652 if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { 2653 error_setg(errp, 2654 "This operation would make the refcount table grow " 2655 "beyond the maximum size supported by QEMU, aborting"); 2656 return -ENOTSUP; 2657 } 2658 2659 new_reftable = g_try_realloc(*reftable, new_reftable_size * 2660 sizeof(uint64_t)); 2661 if (!new_reftable) { 2662 error_setg(errp, "Failed to increase reftable buffer size"); 2663 return -ENOMEM; 2664 } 2665 2666 memset(new_reftable + *reftable_size, 0, 2667 (new_reftable_size - *reftable_size) * sizeof(uint64_t)); 2668 2669 *reftable = new_reftable; 2670 *reftable_size = new_reftable_size; 2671 } 2672 2673 if (!refblock_empty && !(*reftable)[reftable_index]) { 2674 offset = qcow2_alloc_clusters(bs, s->cluster_size); 2675 if (offset < 0) { 2676 error_setg_errno(errp, -offset, "Failed to allocate refblock"); 2677 return offset; 2678 } 2679 (*reftable)[reftable_index] = offset; 2680 *allocated = true; 2681 } 2682 2683 return 0; 2684 } 2685 2686 /** 2687 * This "operation" for walk_over_reftable() writes the refblock to disk at the 2688 * offset specified by the new reftable's entry. It does not modify the new 2689 * reftable or change any refcounts. 2690 */ 2691 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable, 2692 uint64_t reftable_index, uint64_t *reftable_size, 2693 void *refblock, bool refblock_empty, bool *allocated, 2694 Error **errp) 2695 { 2696 BDRVQcow2State *s = bs->opaque; 2697 int64_t offset; 2698 int ret; 2699 2700 if (reftable_index < *reftable_size && (*reftable)[reftable_index]) { 2701 offset = (*reftable)[reftable_index]; 2702 2703 ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); 2704 if (ret < 0) { 2705 error_setg_errno(errp, -ret, "Overlap check failed"); 2706 return ret; 2707 } 2708 2709 ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size); 2710 if (ret < 0) { 2711 error_setg_errno(errp, -ret, "Failed to write refblock"); 2712 return ret; 2713 } 2714 } else { 2715 assert(refblock_empty); 2716 } 2717 2718 return 0; 2719 } 2720 2721 /** 2722 * This function walks over the existing reftable and every referenced refblock; 2723 * if @new_set_refcount is non-NULL, it is called for every refcount entry to 2724 * create an equal new entry in the passed @new_refblock. Once that 2725 * @new_refblock is completely filled, @operation will be called. 2726 * 2727 * @status_cb and @cb_opaque are used for the amend operation's status callback. 2728 * @index is the index of the walk_over_reftable() calls and @total is the total 2729 * number of walk_over_reftable() calls per amend operation. Both are used for 2730 * calculating the parameters for the status callback. 2731 * 2732 * @allocated is set to true if a new cluster has been allocated. 2733 */ 2734 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable, 2735 uint64_t *new_reftable_index, 2736 uint64_t *new_reftable_size, 2737 void *new_refblock, int new_refblock_size, 2738 int new_refcount_bits, 2739 RefblockFinishOp *operation, bool *allocated, 2740 Qcow2SetRefcountFunc *new_set_refcount, 2741 BlockDriverAmendStatusCB *status_cb, 2742 void *cb_opaque, int index, int total, 2743 Error **errp) 2744 { 2745 BDRVQcow2State *s = bs->opaque; 2746 uint64_t reftable_index; 2747 bool new_refblock_empty = true; 2748 int refblock_index; 2749 int new_refblock_index = 0; 2750 int ret; 2751 2752 for (reftable_index = 0; reftable_index < s->refcount_table_size; 2753 reftable_index++) 2754 { 2755 uint64_t refblock_offset = s->refcount_table[reftable_index] 2756 & REFT_OFFSET_MASK; 2757 2758 status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index, 2759 (uint64_t)total * s->refcount_table_size, cb_opaque); 2760 2761 if (refblock_offset) { 2762 void *refblock; 2763 2764 if (offset_into_cluster(s, refblock_offset)) { 2765 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" 2766 PRIx64 " unaligned (reftable index: %#" 2767 PRIx64 ")", refblock_offset, 2768 reftable_index); 2769 error_setg(errp, 2770 "Image is corrupt (unaligned refblock offset)"); 2771 return -EIO; 2772 } 2773 2774 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset, 2775 &refblock); 2776 if (ret < 0) { 2777 error_setg_errno(errp, -ret, "Failed to retrieve refblock"); 2778 return ret; 2779 } 2780 2781 for (refblock_index = 0; refblock_index < s->refcount_block_size; 2782 refblock_index++) 2783 { 2784 uint64_t refcount; 2785 2786 if (new_refblock_index >= new_refblock_size) { 2787 /* new_refblock is now complete */ 2788 ret = operation(bs, new_reftable, *new_reftable_index, 2789 new_reftable_size, new_refblock, 2790 new_refblock_empty, allocated, errp); 2791 if (ret < 0) { 2792 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 2793 return ret; 2794 } 2795 2796 (*new_reftable_index)++; 2797 new_refblock_index = 0; 2798 new_refblock_empty = true; 2799 } 2800 2801 refcount = s->get_refcount(refblock, refblock_index); 2802 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) { 2803 uint64_t offset; 2804 2805 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 2806 2807 offset = ((reftable_index << s->refcount_block_bits) 2808 + refblock_index) << s->cluster_bits; 2809 2810 error_setg(errp, "Cannot decrease refcount entry width to " 2811 "%i bits: Cluster at offset %#" PRIx64 " has a " 2812 "refcount of %" PRIu64, new_refcount_bits, 2813 offset, refcount); 2814 return -EINVAL; 2815 } 2816 2817 if (new_set_refcount) { 2818 new_set_refcount(new_refblock, new_refblock_index++, 2819 refcount); 2820 } else { 2821 new_refblock_index++; 2822 } 2823 new_refblock_empty = new_refblock_empty && refcount == 0; 2824 } 2825 2826 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 2827 } else { 2828 /* No refblock means every refcount is 0 */ 2829 for (refblock_index = 0; refblock_index < s->refcount_block_size; 2830 refblock_index++) 2831 { 2832 if (new_refblock_index >= new_refblock_size) { 2833 /* new_refblock is now complete */ 2834 ret = operation(bs, new_reftable, *new_reftable_index, 2835 new_reftable_size, new_refblock, 2836 new_refblock_empty, allocated, errp); 2837 if (ret < 0) { 2838 return ret; 2839 } 2840 2841 (*new_reftable_index)++; 2842 new_refblock_index = 0; 2843 new_refblock_empty = true; 2844 } 2845 2846 if (new_set_refcount) { 2847 new_set_refcount(new_refblock, new_refblock_index++, 0); 2848 } else { 2849 new_refblock_index++; 2850 } 2851 } 2852 } 2853 } 2854 2855 if (new_refblock_index > 0) { 2856 /* Complete the potentially existing partially filled final refblock */ 2857 if (new_set_refcount) { 2858 for (; new_refblock_index < new_refblock_size; 2859 new_refblock_index++) 2860 { 2861 new_set_refcount(new_refblock, new_refblock_index, 0); 2862 } 2863 } 2864 2865 ret = operation(bs, new_reftable, *new_reftable_index, 2866 new_reftable_size, new_refblock, new_refblock_empty, 2867 allocated, errp); 2868 if (ret < 0) { 2869 return ret; 2870 } 2871 2872 (*new_reftable_index)++; 2873 } 2874 2875 status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size, 2876 (uint64_t)total * s->refcount_table_size, cb_opaque); 2877 2878 return 0; 2879 } 2880 2881 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, 2882 BlockDriverAmendStatusCB *status_cb, 2883 void *cb_opaque, Error **errp) 2884 { 2885 BDRVQcow2State *s = bs->opaque; 2886 Qcow2GetRefcountFunc *new_get_refcount; 2887 Qcow2SetRefcountFunc *new_set_refcount; 2888 void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size); 2889 uint64_t *new_reftable = NULL, new_reftable_size = 0; 2890 uint64_t *old_reftable, old_reftable_size, old_reftable_offset; 2891 uint64_t new_reftable_index = 0; 2892 uint64_t i; 2893 int64_t new_reftable_offset = 0, allocated_reftable_size = 0; 2894 int new_refblock_size, new_refcount_bits = 1 << refcount_order; 2895 int old_refcount_order; 2896 int walk_index = 0; 2897 int ret; 2898 bool new_allocation; 2899 2900 assert(s->qcow_version >= 3); 2901 assert(refcount_order >= 0 && refcount_order <= 6); 2902 2903 /* see qcow2_open() */ 2904 new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3)); 2905 2906 new_get_refcount = get_refcount_funcs[refcount_order]; 2907 new_set_refcount = set_refcount_funcs[refcount_order]; 2908 2909 2910 do { 2911 int total_walks; 2912 2913 new_allocation = false; 2914 2915 /* At least we have to do this walk and the one which writes the 2916 * refblocks; also, at least we have to do this loop here at least 2917 * twice (normally), first to do the allocations, and second to 2918 * determine that everything is correctly allocated, this then makes 2919 * three walks in total */ 2920 total_walks = MAX(walk_index + 2, 3); 2921 2922 /* First, allocate the structures so they are present in the refcount 2923 * structures */ 2924 ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, 2925 &new_reftable_size, NULL, new_refblock_size, 2926 new_refcount_bits, &alloc_refblock, 2927 &new_allocation, NULL, status_cb, cb_opaque, 2928 walk_index++, total_walks, errp); 2929 if (ret < 0) { 2930 goto done; 2931 } 2932 2933 new_reftable_index = 0; 2934 2935 if (new_allocation) { 2936 if (new_reftable_offset) { 2937 qcow2_free_clusters(bs, new_reftable_offset, 2938 allocated_reftable_size * sizeof(uint64_t), 2939 QCOW2_DISCARD_NEVER); 2940 } 2941 2942 new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size * 2943 sizeof(uint64_t)); 2944 if (new_reftable_offset < 0) { 2945 error_setg_errno(errp, -new_reftable_offset, 2946 "Failed to allocate the new reftable"); 2947 ret = new_reftable_offset; 2948 goto done; 2949 } 2950 allocated_reftable_size = new_reftable_size; 2951 } 2952 } while (new_allocation); 2953 2954 /* Second, write the new refblocks */ 2955 ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, 2956 &new_reftable_size, new_refblock, 2957 new_refblock_size, new_refcount_bits, 2958 &flush_refblock, &new_allocation, new_set_refcount, 2959 status_cb, cb_opaque, walk_index, walk_index + 1, 2960 errp); 2961 if (ret < 0) { 2962 goto done; 2963 } 2964 assert(!new_allocation); 2965 2966 2967 /* Write the new reftable */ 2968 ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset, 2969 new_reftable_size * sizeof(uint64_t)); 2970 if (ret < 0) { 2971 error_setg_errno(errp, -ret, "Overlap check failed"); 2972 goto done; 2973 } 2974 2975 for (i = 0; i < new_reftable_size; i++) { 2976 cpu_to_be64s(&new_reftable[i]); 2977 } 2978 2979 ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable, 2980 new_reftable_size * sizeof(uint64_t)); 2981 2982 for (i = 0; i < new_reftable_size; i++) { 2983 be64_to_cpus(&new_reftable[i]); 2984 } 2985 2986 if (ret < 0) { 2987 error_setg_errno(errp, -ret, "Failed to write the new reftable"); 2988 goto done; 2989 } 2990 2991 2992 /* Empty the refcount cache */ 2993 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 2994 if (ret < 0) { 2995 error_setg_errno(errp, -ret, "Failed to flush the refblock cache"); 2996 goto done; 2997 } 2998 2999 /* Update the image header to point to the new reftable; this only updates 3000 * the fields which are relevant to qcow2_update_header(); other fields 3001 * such as s->refcount_table or s->refcount_bits stay stale for now 3002 * (because we have to restore everything if qcow2_update_header() fails) */ 3003 old_refcount_order = s->refcount_order; 3004 old_reftable_size = s->refcount_table_size; 3005 old_reftable_offset = s->refcount_table_offset; 3006 3007 s->refcount_order = refcount_order; 3008 s->refcount_table_size = new_reftable_size; 3009 s->refcount_table_offset = new_reftable_offset; 3010 3011 ret = qcow2_update_header(bs); 3012 if (ret < 0) { 3013 s->refcount_order = old_refcount_order; 3014 s->refcount_table_size = old_reftable_size; 3015 s->refcount_table_offset = old_reftable_offset; 3016 error_setg_errno(errp, -ret, "Failed to update the qcow2 header"); 3017 goto done; 3018 } 3019 3020 /* Now update the rest of the in-memory information */ 3021 old_reftable = s->refcount_table; 3022 s->refcount_table = new_reftable; 3023 update_max_refcount_table_index(s); 3024 3025 s->refcount_bits = 1 << refcount_order; 3026 s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); 3027 s->refcount_max += s->refcount_max - 1; 3028 3029 s->refcount_block_bits = s->cluster_bits - (refcount_order - 3); 3030 s->refcount_block_size = 1 << s->refcount_block_bits; 3031 3032 s->get_refcount = new_get_refcount; 3033 s->set_refcount = new_set_refcount; 3034 3035 /* For cleaning up all old refblocks and the old reftable below the "done" 3036 * label */ 3037 new_reftable = old_reftable; 3038 new_reftable_size = old_reftable_size; 3039 new_reftable_offset = old_reftable_offset; 3040 3041 done: 3042 if (new_reftable) { 3043 /* On success, new_reftable actually points to the old reftable (and 3044 * new_reftable_size is the old reftable's size); but that is just 3045 * fine */ 3046 for (i = 0; i < new_reftable_size; i++) { 3047 uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK; 3048 if (offset) { 3049 qcow2_free_clusters(bs, offset, s->cluster_size, 3050 QCOW2_DISCARD_OTHER); 3051 } 3052 } 3053 g_free(new_reftable); 3054 3055 if (new_reftable_offset > 0) { 3056 qcow2_free_clusters(bs, new_reftable_offset, 3057 new_reftable_size * sizeof(uint64_t), 3058 QCOW2_DISCARD_OTHER); 3059 } 3060 } 3061 3062 qemu_vfree(new_refblock); 3063 return ret; 3064 } 3065 3066 static int qcow2_discard_refcount_block(BlockDriverState *bs, 3067 uint64_t discard_block_offs) 3068 { 3069 BDRVQcow2State *s = bs->opaque; 3070 uint64_t refblock_offs = get_refblock_offset(s, discard_block_offs); 3071 uint64_t cluster_index = discard_block_offs >> s->cluster_bits; 3072 uint32_t block_index = cluster_index & (s->refcount_block_size - 1); 3073 void *refblock; 3074 int ret; 3075 3076 assert(discard_block_offs != 0); 3077 3078 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs, 3079 &refblock); 3080 if (ret < 0) { 3081 return ret; 3082 } 3083 3084 if (s->get_refcount(refblock, block_index) != 1) { 3085 qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:" 3086 " refblock offset %#" PRIx64 3087 ", reftable index %u" 3088 ", block offset %#" PRIx64 3089 ", refcount %#" PRIx64, 3090 refblock_offs, 3091 offset_to_reftable_index(s, discard_block_offs), 3092 discard_block_offs, 3093 s->get_refcount(refblock, block_index)); 3094 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 3095 return -EINVAL; 3096 } 3097 s->set_refcount(refblock, block_index, 0); 3098 3099 qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, refblock); 3100 3101 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 3102 3103 if (cluster_index < s->free_cluster_index) { 3104 s->free_cluster_index = cluster_index; 3105 } 3106 3107 refblock = qcow2_cache_is_table_offset(bs, s->refcount_block_cache, 3108 discard_block_offs); 3109 if (refblock) { 3110 /* discard refblock from the cache if refblock is cached */ 3111 qcow2_cache_discard(bs, s->refcount_block_cache, refblock); 3112 } 3113 update_refcount_discard(bs, discard_block_offs, s->cluster_size); 3114 3115 return 0; 3116 } 3117 3118 int qcow2_shrink_reftable(BlockDriverState *bs) 3119 { 3120 BDRVQcow2State *s = bs->opaque; 3121 uint64_t *reftable_tmp = 3122 g_malloc(s->refcount_table_size * sizeof(uint64_t)); 3123 int i, ret; 3124 3125 for (i = 0; i < s->refcount_table_size; i++) { 3126 int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK; 3127 void *refblock; 3128 bool unused_block; 3129 3130 if (refblock_offs == 0) { 3131 reftable_tmp[i] = 0; 3132 continue; 3133 } 3134 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs, 3135 &refblock); 3136 if (ret < 0) { 3137 goto out; 3138 } 3139 3140 /* the refblock has own reference */ 3141 if (i == offset_to_reftable_index(s, refblock_offs)) { 3142 uint64_t block_index = (refblock_offs >> s->cluster_bits) & 3143 (s->refcount_block_size - 1); 3144 uint64_t refcount = s->get_refcount(refblock, block_index); 3145 3146 s->set_refcount(refblock, block_index, 0); 3147 3148 unused_block = buffer_is_zero(refblock, s->cluster_size); 3149 3150 s->set_refcount(refblock, block_index, refcount); 3151 } else { 3152 unused_block = buffer_is_zero(refblock, s->cluster_size); 3153 } 3154 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 3155 3156 reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]); 3157 } 3158 3159 ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp, 3160 s->refcount_table_size * sizeof(uint64_t)); 3161 /* 3162 * If the write in the reftable failed the image may contain a partially 3163 * overwritten reftable. In this case it would be better to clear the 3164 * reftable in memory to avoid possible image corruption. 3165 */ 3166 for (i = 0; i < s->refcount_table_size; i++) { 3167 if (s->refcount_table[i] && !reftable_tmp[i]) { 3168 if (ret == 0) { 3169 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] & 3170 REFT_OFFSET_MASK); 3171 } 3172 s->refcount_table[i] = 0; 3173 } 3174 } 3175 3176 if (!s->cache_discards) { 3177 qcow2_process_discards(bs, ret); 3178 } 3179 3180 out: 3181 g_free(reftable_tmp); 3182 return ret; 3183 } 3184 3185 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size) 3186 { 3187 BDRVQcow2State *s = bs->opaque; 3188 int64_t i; 3189 3190 for (i = size_to_clusters(s, size) - 1; i >= 0; i--) { 3191 uint64_t refcount; 3192 int ret = qcow2_get_refcount(bs, i, &refcount); 3193 if (ret < 0) { 3194 fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", 3195 i, strerror(-ret)); 3196 return ret; 3197 } 3198 if (refcount > 0) { 3199 return i; 3200 } 3201 } 3202 qcow2_signal_corruption(bs, true, -1, -1, 3203 "There are no references in the refcount table."); 3204 return -EIO; 3205 } 3206