/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};


/*********************************************************/
/* refcount handling */

static void update_max_refcount_table_index(BDRVQcow2State *s)
{
    unsigned i = s->refcount_table_size - 1;
    while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
        i--;
    }
    /* Set s->max_refcount_table_index to the index of the last used entry */
    s->max_refcount_table_index = i;
}

int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
        update_max_refcount_table_index(s);
    }
    return 0;
fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}
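
/*
 * Illustrative example of the packed sub-byte accessors above (values are
 * made up, not taken from a real image): with refcount_order = 1, four
 * two-bit refcounts are packed into each byte.  For index = 5, index / 4
 * selects byte 1 and 2 * (index % 4) = 2 is the bit shift, so
 * set_refcount_ro1(array, 5, 3) clears and then sets bits 2-3 of byte 1,
 * and get_refcount_ro1(array, 5) reads them back as 3.
 */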

static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);

    return 0;
}

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
                                  uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}
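
/*
 * Example of the index arithmetic above (illustrative values): with 64 KiB
 * clusters and 16-bit refcounts, a refblock holds 32768 entries, so
 * refcount_block_bits is 15.  For cluster_index = 40000, the reftable index
 * is 40000 >> 15 = 1 and the index within that refblock is
 * 40000 & 32767 = 7232.
 */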

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. At a minimum that is
     * a cluster for the new refcount block. It may also include a new
     * refcount table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount()
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
            " at %" PRIx64 "\n",
            refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
                          (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }
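
    /*
     * Illustrative numbers for the self-describing case above (assumed, not
     * from a real image): with 64 KiB clusters and 32768 entries per
     * refblock, a refblock placed at offset 0x30000 (cluster 3) covers
     * clusters 0-32767 and therefore covers itself; block_index is then
     * (0x30000 >> 16) & 32767 = 3, which is the entry set to 1 above.
     */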

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
                           s->cluster_size;
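
    /*
     * Worked example with assumed values: for 64 KiB clusters and 32768
     * entries per refblock, cluster_index = 70000 and new_block in cluster
     * 70001 give blocks_used = DIV_ROUND_UP(70002, 32768) = 3, so
     * meta_offset = 3 * 32768 * 65536 points at the first cluster past
     * everything the three existing refblocks can describe.
     */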
469 */ 470 uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1, 471 (new_block >> s->cluster_bits) + 1), 472 s->refcount_block_size); 473 474 /* Create the new refcount table and blocks */ 475 uint64_t meta_offset = (blocks_used * s->refcount_block_size) * 476 s->cluster_size; 477 478 ret = qcow2_refcount_area(bs, meta_offset, 0, false, 479 refcount_table_index, new_block); 480 if (ret < 0) { 481 return ret; 482 } 483 484 ret = load_refcount_block(bs, new_block, refcount_block); 485 if (ret < 0) { 486 return ret; 487 } 488 489 /* If we were trying to do the initial refcount update for some cluster 490 * allocation, we might have used the same clusters to store newly 491 * allocated metadata. Make the caller search some new space. */ 492 return -EAGAIN; 493 494 fail: 495 if (*refcount_block != NULL) { 496 qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); 497 } 498 return ret; 499 } 500 501 /* 502 * Starting at @start_offset, this function creates new self-covering refcount 503 * structures: A new refcount table and refcount blocks which cover all of 504 * themselves, and a number of @additional_clusters beyond their end. 505 * @start_offset must be at the end of the image file, that is, there must be 506 * only empty space beyond it. 507 * If @exact_size is false, the refcount table will have 50 % more entries than 508 * necessary so it will not need to grow again soon. 509 * If @new_refblock_offset is not zero, it contains the offset of a refcount 510 * block that should be entered into the new refcount table at index 511 * @new_refblock_index. 512 * 513 * Returns: The offset after the new refcount structures (i.e. where the 514 * @additional_clusters may be placed) on success, -errno on error. 515 */ 516 int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset, 517 uint64_t additional_clusters, bool exact_size, 518 int new_refblock_index, 519 uint64_t new_refblock_offset) 520 { 521 BDRVQcow2State *s = bs->opaque; 522 uint64_t total_refblock_count_u64, additional_refblock_count; 523 int total_refblock_count, table_size, area_reftable_index, table_clusters; 524 int i; 525 uint64_t table_offset, block_offset, end_offset; 526 int ret; 527 uint64_t *new_table; 528 529 assert(!(start_offset % s->cluster_size)); 530 531 qcow2_refcount_metadata_size(start_offset / s->cluster_size + 532 additional_clusters, 533 s->cluster_size, s->refcount_order, 534 !exact_size, &total_refblock_count_u64); 535 if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) { 536 return -EFBIG; 537 } 538 total_refblock_count = total_refblock_count_u64; 539 540 /* Index in the refcount table of the first refcount block to cover the area 541 * of refcount structures we are about to create; we know that 542 * @total_refblock_count can cover @start_offset, so this will definitely 543 * fit into an int. 

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * sizeof(uint64_t));
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(bs, s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
                           table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
        }

        g_free(d);
    }
}

static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs = bs,
        .offset = offset,
        .bytes = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
               || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
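
/*
 * Merging example (hypothetical offsets): if the list already holds a
 * region covering [0x10000, 0x20000) and a cluster at [0x20000, 0x30000)
 * is freed, then new_end - new_start == length + d->bytes, so the first
 * loop extends the existing region to [0x10000, 0x30000) instead of
 * queueing a second discard request.
 */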

/* XXX: cache several refcount block clusters? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
                                   s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
         cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                     refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases, like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in an int64_t */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}
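
/*
 * Search example (hypothetical state): if free_cluster_index is 10 and
 * clusters 10 and 11 are free but 12 is in use, a three-cluster request
 * scans 10-12, hits the nonzero refcount at 12 and retries from 13; the
 * function only returns once it has found nb_clusters consecutive free
 * entries ending at free_cluster_index - 1.
 */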

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}
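
/*
 * Illustrative use (hypothetical caller, not part of this file): the
 * function returns how many of the requested clusters were actually free
 * and got allocated, so a caller asking for 4 clusters at @offset may
 * legitimately get back fewer:
 *
 *     int64_t n = qcow2_alloc_clusters_at(bs, offset, 4);
 *     if (n < 0) {
 *         return n;    // -errno
 *     }
 *     // n clusters starting at offset are now allocated (0 <= n <= 4)
 */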

/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
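
/*
 * Byte-allocation example (assumed 64 KiB clusters): a first call with
 * size = 40000 places the data at the start of a fresh cluster and leaves
 * free_byte_offset at offset 40000 within it; a following call with
 * size = 30000 does not fit (free_in_cluster is 25536), so one more
 * cluster is allocated and, if it is physically adjacent, the request
 * simply spans the cluster boundary.  Each cluster touched by a request
 * has its refcount incremented once per allocation landing in it.
 */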

void qcow2_free_clusters(BlockDriverState *bs,
                         int64_t offset, int64_t size,
                         enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;

    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                (l2_entry & s->cluster_offset_mask) & ~511,
                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO_ALLOC:
        if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, false, -1, -1,
                                    "Cannot free unaligned cluster %#llx",
                                    l2_entry & L2E_OFFSET_MASK);
        } else {
            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                nb_clusters << s->cluster_bits, type);
        }
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}
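
/*
 * Compressed-entry example (field values are made up): the L2 entry of a
 * compressed cluster stores the sector count minus one in its csize field,
 * so a field value of 5 means nb_csectors = 6 and 6 * 512 = 3072 bytes are
 * freed, starting at the entry's host offset rounded down to a 512-byte
 * boundary.
 */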


/*********************************************************/
/* snapshots and image creation */



/* update the refcounts of snapshots and the copied flag */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                                  (void **) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;
                uint64_t offset;

                entry = be64_to_cpu(l2_table[j]);
                old_entry = entry;
                entry &= ~QCOW_OFLAG_COPIED;
                offset = entry & L2E_OFFSET_MASK;

                switch (qcow2_get_cluster_type(entry)) {
                case QCOW2_CLUSTER_COMPRESSED:
                    nb_csectors = ((entry >> s->csize_shift) &
                                   s->csize_mask) + 1;
                    if (addend != 0) {
                        ret = update_refcount(bs,
                                (entry & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }
                    /* compressed clusters are never modified */
                    refcount = 2;
                    break;

                case QCOW2_CLUSTER_NORMAL:
                case QCOW2_CLUSTER_ZERO_ALLOC:
                    if (offset_into_cluster(s, offset)) {
                        qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
                                                "allocation offset %#" PRIx64
                                                " unaligned (L2 offset: %#"
                                                PRIx64 ", L2 index: %#x)",
                                                offset, l2_offset, j);
                        ret = -EIO;
                        goto fail;
                    }

                    cluster_index = offset >> s->cluster_bits;
                    assert(cluster_index);
                    if (addend != 0) {
                        ret = qcow2_update_cluster_refcount(bs,
                                    cluster_index, abs(addend), addend < 0,
                                    QCOW2_DISCARD_SNAPSHOT);
                        if (ret < 0) {
                            goto fail;
                        }
                    }

                    ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                    if (ret < 0) {
                        goto fail;
                    }
                    break;

                case QCOW2_CLUSTER_ZERO_PLAIN:
                case QCOW2_CLUSTER_UNALLOCATED:
                    refcount = 0;
                    break;

                default:
                    abort();
                }

                if (refcount == 1) {
                    entry |= QCOW_OFLAG_COPIED;
                }
                if (entry != old_entry) {
                    if (addend > 0) {
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                   s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(entry);
                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                 l2_table);
                }
            }

            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}


/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}

/**
 * Reallocates *array so that it can hold new_size entries. *size must contain
 * the current number of entries in *array. If the reallocation fails, *array
 * and *size will not be modified and -errno will be returned. If the
 * reallocation is successful, *array will be set to the new buffer, *size
 * will be set to new_size and 0 will be returned. The size of the reallocated
 * refcount array buffer will be aligned to a cluster boundary, and the newly
 * allocated area will be zeroed.
 */
static int realloc_refcount_array(BDRVQcow2State *s, void **array,
                                  int64_t *size, int64_t new_size)
{
    int64_t old_byte_size, new_byte_size;
    void *new_ptr;

    /* Round to clusters so the array can be directly written to disk */
    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
                    * s->cluster_size;
    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
                    * s->cluster_size;

    if (new_byte_size == old_byte_size) {
        *size = new_size;
        return 0;
    }

    assert(new_byte_size > 0);

    if (new_byte_size > SIZE_MAX) {
        return -ENOMEM;
    }

    new_ptr = g_try_realloc(*array, new_byte_size);
    if (!new_ptr) {
        return -ENOMEM;
    }

    if (new_byte_size > old_byte_size) {
        memset((char *)new_ptr + old_byte_size, 0,
               new_byte_size - old_byte_size);
    }

    *array = new_ptr;
    *size = new_size;

    return 0;
}
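
/*
 * Size example (illustrative): with refcount_order = 4 (16-bit refcounts),
 * 1000 entries need DIV_ROUND_UP(1000 << 4, 8) = 2000 bytes, which
 * realloc_refcount_array() then rounds up to one whole 64 KiB cluster so
 * that the buffer can be written to disk directly.
 */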
1420 */ 1421 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res, 1422 void **refcount_table, 1423 int64_t *refcount_table_size, 1424 int64_t offset, int64_t size) 1425 { 1426 BDRVQcow2State *s = bs->opaque; 1427 uint64_t start, last, cluster_offset, k, refcount; 1428 int ret; 1429 1430 if (size <= 0) { 1431 return 0; 1432 } 1433 1434 start = start_of_cluster(s, offset); 1435 last = start_of_cluster(s, offset + size - 1); 1436 for(cluster_offset = start; cluster_offset <= last; 1437 cluster_offset += s->cluster_size) { 1438 k = cluster_offset >> s->cluster_bits; 1439 if (k >= *refcount_table_size) { 1440 ret = realloc_refcount_array(s, refcount_table, 1441 refcount_table_size, k + 1); 1442 if (ret < 0) { 1443 res->check_errors++; 1444 return ret; 1445 } 1446 } 1447 1448 refcount = s->get_refcount(*refcount_table, k); 1449 if (refcount == s->refcount_max) { 1450 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 1451 "\n", cluster_offset); 1452 fprintf(stderr, "Use qemu-img amend to increase the refcount entry " 1453 "width or qemu-img convert to create a clean copy if the " 1454 "image cannot be opened for writing\n"); 1455 res->corruptions++; 1456 continue; 1457 } 1458 s->set_refcount(*refcount_table, k, refcount + 1); 1459 } 1460 1461 return 0; 1462 } 1463 1464 /* Flags for check_refcounts_l1() and check_refcounts_l2() */ 1465 enum { 1466 CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ 1467 }; 1468 1469 /* 1470 * Increases the refcount in the given refcount table for the all clusters 1471 * referenced in the L2 table. While doing so, performs some checks on L2 1472 * entries. 1473 * 1474 * Returns the number of errors found by the checks or -errno if an internal 1475 * error occurred. 1476 */ 1477 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, 1478 void **refcount_table, 1479 int64_t *refcount_table_size, int64_t l2_offset, 1480 int flags) 1481 { 1482 BDRVQcow2State *s = bs->opaque; 1483 uint64_t *l2_table, l2_entry; 1484 uint64_t next_contiguous_offset = 0; 1485 int i, l2_size, nb_csectors, ret; 1486 1487 /* Read L2 table from disk */ 1488 l2_size = s->l2_size * sizeof(uint64_t); 1489 l2_table = g_malloc(l2_size); 1490 1491 ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); 1492 if (ret < 0) { 1493 fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); 1494 res->check_errors++; 1495 goto fail; 1496 } 1497 1498 /* Do the actual checks */ 1499 for(i = 0; i < s->l2_size; i++) { 1500 l2_entry = be64_to_cpu(l2_table[i]); 1501 1502 switch (qcow2_get_cluster_type(l2_entry)) { 1503 case QCOW2_CLUSTER_COMPRESSED: 1504 /* Compressed clusters don't have QCOW_OFLAG_COPIED */ 1505 if (l2_entry & QCOW_OFLAG_COPIED) { 1506 fprintf(stderr, "ERROR: cluster %" PRId64 ": " 1507 "copied flag must never be set for compressed " 1508 "clusters\n", l2_entry >> s->cluster_bits); 1509 l2_entry &= ~QCOW_OFLAG_COPIED; 1510 res->corruptions++; 1511 } 1512 1513 /* Mark cluster as used */ 1514 nb_csectors = ((l2_entry >> s->csize_shift) & 1515 s->csize_mask) + 1; 1516 l2_entry &= s->cluster_offset_mask; 1517 ret = qcow2_inc_refcounts_imrt(bs, res, 1518 refcount_table, refcount_table_size, 1519 l2_entry & ~511, nb_csectors * 512); 1520 if (ret < 0) { 1521 goto fail; 1522 } 1523 1524 if (flags & CHECK_FRAG_INFO) { 1525 res->bfi.allocated_clusters++; 1526 res->bfi.compressed_clusters++; 1527 1528 /* Compressed clusters are fragmented by nature. 

/*
 * Increases the refcount in the given refcount table for all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
 * entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size, int64_t l2_offset,
                              int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table, l2_entry;
    uint64_t next_contiguous_offset = 0;
    int i, l2_size, nb_csectors, ret;

    /* Read L2 table from disk */
    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = g_malloc(l2_size);

    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        goto fail;
    }

    /* Do the actual checks */
    for (i = 0; i < s->l2_size; i++) {
        l2_entry = be64_to_cpu(l2_table[i]);

        switch (qcow2_get_cluster_type(l2_entry)) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
                        "copied flag must never be set for compressed "
                        "clusters\n", l2_entry >> s->cluster_bits);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* Mark cluster as used */
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            l2_entry &= s->cluster_offset_mask;
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           l2_entry & ~511, nb_csectors * 512);
            if (ret < 0) {
                goto fail;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /* Compressed clusters are fragmented by nature. Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used */
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
                        "properly aligned; L2 entry corrupted.\n", offset);
                res->corruptions++;
            }
            break;
        }

        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_UNALLOCATED:
            break;

        default:
            abort();
        }
    }

    g_free(l2_table);
    return 0;

fail:
    g_free(l2_table);
    return ret;
}

/*
 * Increases the refcount for the L1 table, its L2 tables and all referenced
 * clusters in the given refcount table. While doing so, performs some checks
 * on L1 and L2 entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l1(BlockDriverState *bs,
                              BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size,
                              int64_t l1_table_offset, int l1_size,
                              int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL, l2_offset, l1_size2;
    int i, ret;

    l1_size2 = l1_size * sizeof(uint64_t);

    /* Mark L1 table as used */
    ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
                                   l1_table_offset, l1_size2);
    if (ret < 0) {
        goto fail;
    }

    /* Read L1 table entries from disk */
    if (l1_size2 > 0) {
        l1_table = g_try_malloc(l1_size2);
        if (l1_table == NULL) {
            ret = -ENOMEM;
            res->check_errors++;
            goto fail;
        }
        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
            res->check_errors++;
            goto fail;
        }
        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }

    /* Do the actual checks */
    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            /* Mark L2 table as used */
            l2_offset &= L1E_OFFSET_MASK;
            ret = qcow2_inc_refcounts_imrt(bs, res,
                                           refcount_table, refcount_table_size,
                                           l2_offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* L2 tables are cluster aligned */
            if (offset_into_cluster(s, l2_offset)) {
                fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                        "cluster aligned; L1 entry corrupted\n", l2_offset);
                res->corruptions++;
            }

            /* Process and check L2 entries */
            ret = check_refcounts_l2(bs, res, refcount_table,
                                     refcount_table_size, l2_offset, flags);
            if (ret < 0) {
                goto fail;
            }
        }
    }
    g_free(l1_table);
    return 0;

fail:
    g_free(l1_table);
    return ret;
}

/*
 * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
 *
 * This function does not print an error message nor does it increment
 * check_errors if qcow2_get_refcount fails (this is because such an error will
 * have been already detected and sufficiently signaled by the calling function
 * (qcow2_check_refcounts) by the time this function is called).
 */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;

    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        bool l2_dirty = false;

        if (!l2_offset) {
            continue;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
                    i, l1_entry, refcount);
            if (fix & BDRV_FIX_ERRORS) {
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions_fixed++;
            } else {
                res->corruptions++;
            }
        }

        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * sizeof(uint64_t));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                ret = qcow2_get_refcount(bs,
                                         data_offset >> s->cluster_bits,
                                         &refcount);
                if (ret < 0) {
                    /* don't print message nor increment check_errors */
                    continue;
                }
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
                            l2_entry, refcount);
                    if (fix & BDRV_FIX_ERRORS) {
                        l2_table[j] = cpu_to_be64(refcount == 1
                                    ? l2_entry |  QCOW_OFLAG_COPIED
                                    : l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty = true;
                        res->corruptions_fixed++;
                    } else {
                        res->corruptions++;
                    }
                }
            }
        }

        if (l2_dirty) {
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}
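
/*
 * Invariant checked above (example values are hypothetical): a cluster with
 * refcount 1 is referenced exactly once, so its L1/L2 entry must carry
 * QCOW_OFLAG_COPIED and may be written in place; a cluster shared with a
 * snapshot has refcount >= 2 and must not carry the flag.  An entry such as
 * l2_entry = 0x8000000000050000 (COPIED set) with refcount = 2 is therefore
 * reported, and repair simply clears the flag.
 */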
"Repairing" : "ERROR", i); 1819 1820 if (fix & BDRV_FIX_ERRORS) { 1821 int64_t new_nb_clusters; 1822 Error *local_err = NULL; 1823 1824 if (offset > INT64_MAX - s->cluster_size) { 1825 ret = -EINVAL; 1826 goto resize_fail; 1827 } 1828 1829 ret = bdrv_truncate(bs->file, offset + s->cluster_size, 1830 PREALLOC_MODE_OFF, &local_err); 1831 if (ret < 0) { 1832 error_report_err(local_err); 1833 goto resize_fail; 1834 } 1835 size = bdrv_getlength(bs->file->bs); 1836 if (size < 0) { 1837 ret = size; 1838 goto resize_fail; 1839 } 1840 1841 new_nb_clusters = size_to_clusters(s, size); 1842 assert(new_nb_clusters >= *nb_clusters); 1843 1844 ret = realloc_refcount_array(s, refcount_table, 1845 nb_clusters, new_nb_clusters); 1846 if (ret < 0) { 1847 res->check_errors++; 1848 return ret; 1849 } 1850 1851 if (cluster >= *nb_clusters) { 1852 ret = -EINVAL; 1853 goto resize_fail; 1854 } 1855 1856 res->corruptions_fixed++; 1857 ret = qcow2_inc_refcounts_imrt(bs, res, 1858 refcount_table, nb_clusters, 1859 offset, s->cluster_size); 1860 if (ret < 0) { 1861 return ret; 1862 } 1863 /* No need to check whether the refcount is now greater than 1: 1864 * This area was just allocated and zeroed, so it can only be 1865 * exactly 1 after qcow2_inc_refcounts_imrt() */ 1866 continue; 1867 1868 resize_fail: 1869 res->corruptions++; 1870 *rebuild = true; 1871 fprintf(stderr, "ERROR could not resize image: %s\n", 1872 strerror(-ret)); 1873 } else { 1874 res->corruptions++; 1875 } 1876 continue; 1877 } 1878 1879 if (offset != 0) { 1880 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1881 offset, s->cluster_size); 1882 if (ret < 0) { 1883 return ret; 1884 } 1885 if (s->get_refcount(*refcount_table, cluster) != 1) { 1886 fprintf(stderr, "ERROR refcount block %" PRId64 1887 " refcount=%" PRIu64 "\n", i, 1888 s->get_refcount(*refcount_table, cluster)); 1889 res->corruptions++; 1890 *rebuild = true; 1891 } 1892 } 1893 } 1894 1895 return 0; 1896 } 1897 1898 /* 1899 * Calculates an in-memory refcount table. 
1900 */ 1901 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 1902 BdrvCheckMode fix, bool *rebuild, 1903 void **refcount_table, int64_t *nb_clusters) 1904 { 1905 BDRVQcow2State *s = bs->opaque; 1906 int64_t i; 1907 QCowSnapshot *sn; 1908 int ret; 1909 1910 if (!*refcount_table) { 1911 int64_t old_size = 0; 1912 ret = realloc_refcount_array(s, refcount_table, 1913 &old_size, *nb_clusters); 1914 if (ret < 0) { 1915 res->check_errors++; 1916 return ret; 1917 } 1918 } 1919 1920 /* header */ 1921 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1922 0, s->cluster_size); 1923 if (ret < 0) { 1924 return ret; 1925 } 1926 1927 /* current L1 table */ 1928 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, 1929 s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO); 1930 if (ret < 0) { 1931 return ret; 1932 } 1933 1934 /* snapshots */ 1935 for (i = 0; i < s->nb_snapshots; i++) { 1936 sn = s->snapshots + i; 1937 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, 1938 sn->l1_table_offset, sn->l1_size, 0); 1939 if (ret < 0) { 1940 return ret; 1941 } 1942 } 1943 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1944 s->snapshots_offset, s->snapshots_size); 1945 if (ret < 0) { 1946 return ret; 1947 } 1948 1949 /* refcount data */ 1950 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1951 s->refcount_table_offset, 1952 s->refcount_table_size * sizeof(uint64_t)); 1953 if (ret < 0) { 1954 return ret; 1955 } 1956 1957 /* encryption */ 1958 if (s->crypto_header.length) { 1959 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters, 1960 s->crypto_header.offset, 1961 s->crypto_header.length); 1962 if (ret < 0) { 1963 return ret; 1964 } 1965 } 1966 1967 /* bitmaps */ 1968 ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters); 1969 if (ret < 0) { 1970 return ret; 1971 } 1972 1973 return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters); 1974 } 1975 1976 /* 1977 * Compares the actual reference count for each cluster in the image against the 1978 * refcount as reported by the refcount structures on-disk. 1979 */ 1980 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 1981 BdrvCheckMode fix, bool *rebuild, 1982 int64_t *highest_cluster, 1983 void *refcount_table, int64_t nb_clusters) 1984 { 1985 BDRVQcow2State *s = bs->opaque; 1986 int64_t i; 1987 uint64_t refcount1, refcount2; 1988 int ret; 1989 1990 for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { 1991 ret = qcow2_get_refcount(bs, i, &refcount1); 1992 if (ret < 0) { 1993 fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", 1994 i, strerror(-ret)); 1995 res->check_errors++; 1996 continue; 1997 } 1998 1999 refcount2 = s->get_refcount(refcount_table, i); 2000 2001 if (refcount1 > 0 || refcount2 > 0) { 2002 *highest_cluster = i; 2003 } 2004 2005 if (refcount1 != refcount2) { 2006 /* Check if we're allowed to fix the mismatch */ 2007 int *num_fixed = NULL; 2008 if (refcount1 == 0) { 2009 *rebuild = true; 2010 } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { 2011 num_fixed = &res->leaks_fixed; 2012 } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { 2013 num_fixed = &res->corruptions_fixed; 2014 } 2015 2016 fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64 2017 " reference=%" PRIu64 "\n", 2018 num_fixed != NULL ? "Repairing" : 2019 refcount1 < refcount2 ? 
"ERROR" : 2020 "Leaked", 2021 i, refcount1, refcount2); 2022 2023 if (num_fixed) { 2024 ret = update_refcount(bs, i << s->cluster_bits, 1, 2025 refcount_diff(refcount1, refcount2), 2026 refcount1 > refcount2, 2027 QCOW2_DISCARD_ALWAYS); 2028 if (ret >= 0) { 2029 (*num_fixed)++; 2030 continue; 2031 } 2032 } 2033 2034 /* And if we couldn't, print an error */ 2035 if (refcount1 < refcount2) { 2036 res->corruptions++; 2037 } else { 2038 res->leaks++; 2039 } 2040 } 2041 } 2042 } 2043 2044 /* 2045 * Allocates clusters using an in-memory refcount table (IMRT) in contrast to 2046 * the on-disk refcount structures. 2047 * 2048 * On input, *first_free_cluster tells where to start looking, and need not 2049 * actually be a free cluster; the returned offset will not be before that 2050 * cluster. On output, *first_free_cluster points to the first gap found, even 2051 * if that gap was too small to be used as the returned offset. 2052 * 2053 * Note that *first_free_cluster is a cluster index whereas the return value is 2054 * an offset. 2055 */ 2056 static int64_t alloc_clusters_imrt(BlockDriverState *bs, 2057 int cluster_count, 2058 void **refcount_table, 2059 int64_t *imrt_nb_clusters, 2060 int64_t *first_free_cluster) 2061 { 2062 BDRVQcow2State *s = bs->opaque; 2063 int64_t cluster = *first_free_cluster, i; 2064 bool first_gap = true; 2065 int contiguous_free_clusters; 2066 int ret; 2067 2068 /* Starting at *first_free_cluster, find a range of at least cluster_count 2069 * continuously free clusters */ 2070 for (contiguous_free_clusters = 0; 2071 cluster < *imrt_nb_clusters && 2072 contiguous_free_clusters < cluster_count; 2073 cluster++) 2074 { 2075 if (!s->get_refcount(*refcount_table, cluster)) { 2076 contiguous_free_clusters++; 2077 if (first_gap) { 2078 /* If this is the first free cluster found, update 2079 * *first_free_cluster accordingly */ 2080 *first_free_cluster = cluster; 2081 first_gap = false; 2082 } 2083 } else if (contiguous_free_clusters) { 2084 contiguous_free_clusters = 0; 2085 } 2086 } 2087 2088 /* If contiguous_free_clusters is greater than zero, it contains the number 2089 * of continuously free clusters until the current cluster; the first free 2090 * cluster in the current "gap" is therefore 2091 * cluster - contiguous_free_clusters */ 2092 2093 /* If no such range could be found, grow the in-memory refcount table 2094 * accordingly to append free clusters at the end of the image */ 2095 if (contiguous_free_clusters < cluster_count) { 2096 /* contiguous_free_clusters clusters are already empty at the image end; 2097 * we need cluster_count clusters; therefore, we have to allocate 2098 * cluster_count - contiguous_free_clusters new clusters at the end of 2099 * the image (which is the current value of cluster; note that cluster 2100 * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond 2101 * the image end) */ 2102 ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters, 2103 cluster + cluster_count 2104 - contiguous_free_clusters); 2105 if (ret < 0) { 2106 return ret; 2107 } 2108 } 2109 2110 /* Go back to the first free cluster */ 2111 cluster -= contiguous_free_clusters; 2112 for (i = 0; i < cluster_count; i++) { 2113 s->set_refcount(*refcount_table, cluster + i, 1); 2114 } 2115 2116 return cluster << s->cluster_bits; 2117 } 2118 2119 /* 2120 * Creates a new refcount structure based solely on the in-memory information 2121 * given through *refcount_table. All necessary allocations will be reflected 2122 * in that array. 
2123 * 2124 * On success, the old refcount structure is leaked (it will be covered by the 2125 * new refcount structure). 2126 */ 2127 static int rebuild_refcount_structure(BlockDriverState *bs, 2128 BdrvCheckResult *res, 2129 void **refcount_table, 2130 int64_t *nb_clusters) 2131 { 2132 BDRVQcow2State *s = bs->opaque; 2133 int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; 2134 int64_t refblock_offset, refblock_start, refblock_index; 2135 uint32_t reftable_size = 0; 2136 uint64_t *on_disk_reftable = NULL; 2137 void *on_disk_refblock; 2138 int ret = 0; 2139 struct { 2140 uint64_t reftable_offset; 2141 uint32_t reftable_clusters; 2142 } QEMU_PACKED reftable_offset_and_clusters; 2143 2144 qcow2_cache_empty(bs, s->refcount_block_cache); 2145 2146 write_refblocks: 2147 for (; cluster < *nb_clusters; cluster++) { 2148 if (!s->get_refcount(*refcount_table, cluster)) { 2149 continue; 2150 } 2151 2152 refblock_index = cluster >> s->refcount_block_bits; 2153 refblock_start = refblock_index << s->refcount_block_bits; 2154 2155 /* Don't allocate a cluster in a refblock already written to disk */ 2156 if (first_free_cluster < refblock_start) { 2157 first_free_cluster = refblock_start; 2158 } 2159 refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, 2160 nb_clusters, &first_free_cluster); 2161 if (refblock_offset < 0) { 2162 fprintf(stderr, "ERROR allocating refblock: %s\n", 2163 strerror(-refblock_offset)); 2164 res->check_errors++; 2165 ret = refblock_offset; 2166 goto fail; 2167 } 2168 2169 if (reftable_size <= refblock_index) { 2170 uint32_t old_reftable_size = reftable_size; 2171 uint64_t *new_on_disk_reftable; 2172 2173 reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t), 2174 s->cluster_size) / sizeof(uint64_t); 2175 new_on_disk_reftable = g_try_realloc(on_disk_reftable, 2176 reftable_size * 2177 sizeof(uint64_t)); 2178 if (!new_on_disk_reftable) { 2179 res->check_errors++; 2180 ret = -ENOMEM; 2181 goto fail; 2182 } 2183 on_disk_reftable = new_on_disk_reftable; 2184 2185 memset(on_disk_reftable + old_reftable_size, 0, 2186 (reftable_size - old_reftable_size) * sizeof(uint64_t)); 2187 2188 /* The offset we have for the reftable is now no longer valid; 2189 * this will leak that range, but we can easily fix that by running 2190 * a leak-fixing check after this rebuild operation */ 2191 reftable_offset = -1; 2192 } else { 2193 assert(on_disk_reftable); 2194 } 2195 on_disk_reftable[refblock_index] = refblock_offset; 2196 2197 /* If this is apparently the last refblock (for now), try to squeeze the 2198 * reftable in */ 2199 if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits && 2200 reftable_offset < 0) 2201 { 2202 uint64_t reftable_clusters = size_to_clusters(s, reftable_size * 2203 sizeof(uint64_t)); 2204 reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, 2205 refcount_table, nb_clusters, 2206 &first_free_cluster); 2207 if (reftable_offset < 0) { 2208 fprintf(stderr, "ERROR allocating reftable: %s\n", 2209 strerror(-reftable_offset)); 2210 res->check_errors++; 2211 ret = reftable_offset; 2212 goto fail; 2213 } 2214 } 2215 2216 ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, 2217 s->cluster_size); 2218 if (ret < 0) { 2219 fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); 2220 goto fail; 2221 } 2222 2223 /* The size of *refcount_table is always cluster-aligned, therefore the 2224 * write operation will not overflow */ 2225 on_disk_refblock = (void *)((char *) *refcount_table + 2226 refblock_index * 
s->cluster_size); 2227 2228 ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE, 2229 on_disk_refblock, s->cluster_sectors); 2230 if (ret < 0) { 2231 fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); 2232 goto fail; 2233 } 2234 2235 /* Go to the end of this refblock */ 2236 cluster = refblock_start + s->refcount_block_size - 1; 2237 } 2238 2239 if (reftable_offset < 0) { 2240 uint64_t post_refblock_start, reftable_clusters; 2241 2242 post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size); 2243 reftable_clusters = size_to_clusters(s, 2244 reftable_size * sizeof(uint64_t)); 2245 /* Not pretty but simple */ 2246 if (first_free_cluster < post_refblock_start) { 2247 first_free_cluster = post_refblock_start; 2248 } 2249 reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, 2250 refcount_table, nb_clusters, 2251 &first_free_cluster); 2252 if (reftable_offset < 0) { 2253 fprintf(stderr, "ERROR allocating reftable: %s\n", 2254 strerror(-reftable_offset)); 2255 res->check_errors++; 2256 ret = reftable_offset; 2257 goto fail; 2258 } 2259 2260 goto write_refblocks; 2261 } 2262 2263 for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { 2264 cpu_to_be64s(&on_disk_reftable[refblock_index]); 2265 } 2266 2267 ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, 2268 reftable_size * sizeof(uint64_t)); 2269 if (ret < 0) { 2270 fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); 2271 goto fail; 2272 } 2273 2274 assert(reftable_size < INT_MAX / sizeof(uint64_t)); 2275 ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, 2276 reftable_size * sizeof(uint64_t)); 2277 if (ret < 0) { 2278 fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); 2279 goto fail; 2280 } 2281 2282 /* Enter new reftable into the image header */ 2283 reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset); 2284 reftable_offset_and_clusters.reftable_clusters = 2285 cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t))); 2286 ret = bdrv_pwrite_sync(bs->file, 2287 offsetof(QCowHeader, refcount_table_offset), 2288 &reftable_offset_and_clusters, 2289 sizeof(reftable_offset_and_clusters)); 2290 if (ret < 0) { 2291 fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret)); 2292 goto fail; 2293 } 2294 2295 for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { 2296 be64_to_cpus(&on_disk_reftable[refblock_index]); 2297 } 2298 s->refcount_table = on_disk_reftable; 2299 s->refcount_table_offset = reftable_offset; 2300 s->refcount_table_size = reftable_size; 2301 update_max_refcount_table_index(s); 2302 2303 return 0; 2304 2305 fail: 2306 g_free(on_disk_reftable); 2307 return ret; 2308 } 2309 2310 /* 2311 * Checks an image for refcount consistency. 2312 * 2313 * Returns 0 if the check could be performed (any inconsistencies found are 2314 * reported in *res), and -errno when an internal error occurred.
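 *
 * The check proceeds in stages: calculate_refcounts() builds an in-memory
 * refcount table (IMRT) covering all metadata and data clusters,
 * compare_refcounts() diffs it against the on-disk refcounts, and, depending
 * on @fix, mismatches are repaired individually or the whole refcount
 * structure is rebuilt from the IMRT.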
2315 */ 2316 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, 2317 BdrvCheckMode fix) 2318 { 2319 BDRVQcow2State *s = bs->opaque; 2320 BdrvCheckResult pre_compare_res; 2321 int64_t size, highest_cluster, nb_clusters; 2322 void *refcount_table = NULL; 2323 bool rebuild = false; 2324 int ret; 2325 2326 size = bdrv_getlength(bs->file->bs); 2327 if (size < 0) { 2328 res->check_errors++; 2329 return size; 2330 } 2331 2332 nb_clusters = size_to_clusters(s, size); 2333 if (nb_clusters > INT_MAX) { 2334 res->check_errors++; 2335 return -EFBIG; 2336 } 2337 2338 res->bfi.total_clusters = 2339 size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); 2340 2341 ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table, 2342 &nb_clusters); 2343 if (ret < 0) { 2344 goto fail; 2345 } 2346 2347 /* In case we don't need to rebuild the refcount structure (but want to fix 2348 * something), this function is immediately called again, in which case the 2349 * result should be ignored */ 2350 pre_compare_res = *res; 2351 compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table, 2352 nb_clusters); 2353 2354 if (rebuild && (fix & BDRV_FIX_ERRORS)) { 2355 BdrvCheckResult old_res = *res; 2356 int fresh_leaks = 0; 2357 2358 fprintf(stderr, "Rebuilding refcount structure\n"); 2359 ret = rebuild_refcount_structure(bs, res, &refcount_table, 2360 &nb_clusters); 2361 if (ret < 0) { 2362 goto fail; 2363 } 2364 2365 res->corruptions = 0; 2366 res->leaks = 0; 2367 2368 /* Because the old reftable has been exchanged for a new one the 2369 * references have to be recalculated */ 2370 rebuild = false; 2371 memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters)); 2372 ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table, 2373 &nb_clusters); 2374 if (ret < 0) { 2375 goto fail; 2376 } 2377 2378 if (fix & BDRV_FIX_LEAKS) { 2379 /* The old refcount structures are now leaked, fix it; the result 2380 * can be ignored, aside from leaks which were introduced by 2381 * rebuild_refcount_structure() that could not be fixed */ 2382 BdrvCheckResult saved_res = *res; 2383 *res = (BdrvCheckResult){ 0 }; 2384 2385 compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild, 2386 &highest_cluster, refcount_table, nb_clusters); 2387 if (rebuild) { 2388 fprintf(stderr, "ERROR rebuilt refcount structure is still " 2389 "broken\n"); 2390 } 2391 2392 /* Any leaks accounted for here were introduced by 2393 * rebuild_refcount_structure() because that function has created a 2394 * new refcount structure from scratch */ 2395 fresh_leaks = res->leaks; 2396 *res = saved_res; 2397 } 2398 2399 if (res->corruptions < old_res.corruptions) { 2400 res->corruptions_fixed += old_res.corruptions - res->corruptions; 2401 } 2402 if (res->leaks < old_res.leaks) { 2403 res->leaks_fixed += old_res.leaks - res->leaks; 2404 } 2405 res->leaks += fresh_leaks; 2406 } else if (fix) { 2407 if (rebuild) { 2408 fprintf(stderr, "ERROR need to rebuild refcount structures\n"); 2409 res->check_errors++; 2410 ret = -EIO; 2411 goto fail; 2412 } 2413 2414 if (res->leaks || res->corruptions) { 2415 *res = pre_compare_res; 2416 compare_refcounts(bs, res, fix, &rebuild, &highest_cluster, 2417 refcount_table, nb_clusters); 2418 } 2419 } 2420 2421 /* check OFLAG_COPIED */ 2422 ret = check_oflag_copied(bs, res, fix); 2423 if (ret < 0) { 2424 goto fail; 2425 } 2426 2427 res->image_end_offset = (highest_cluster + 1) * s->cluster_size; 2428 ret = 0; 2429 2430 fail: 2431 g_free(refcount_table); 2432 2433 return ret; 2434 } 2435 2436 
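/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * running a repairing check on an already opened, writable qcow2 BDS.
 * BDRV_FIX_ERRORS and BDRV_FIX_LEAKS are the same fix flags handled above.
 *
 *     BdrvCheckResult result = {0};
 *     int ret = qcow2_check_refcounts(bs, &result,
 *                                     BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
 *     if (ret < 0) {
 *         ... internal error, the check could not be performed ...
 *     } else if (result.corruptions || result.leaks) {
 *         ... inconsistencies remain that could not be repaired ...
 *     }
 */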
#define overlaps_with(ofs, sz) \ 2437 ranges_overlap(offset, size, ofs, sz) 2438 2439 /* 2440 * Checks if the given offset into the image file is actually free to use by 2441 * looking for overlaps with important metadata sections (L1/L2 tables etc.), 2442 * i.e. a sanity check without relying on the refcount tables. 2443 * 2444 * The ign parameter is a bitmask of QCow2MetadataOverlap values specifying 2445 * which checks to skip, i.e., which sections to ignore. 2446 * 2447 * Returns: 2448 * - 0 if writing to this offset will not affect the mentioned metadata 2449 * - a positive QCow2MetadataOverlap value indicating one overlapping section 2450 * - a negative value (-errno) indicating an error while performing a check, 2451 * e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2 2452 */ 2453 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, 2454 int64_t size) 2455 { 2456 BDRVQcow2State *s = bs->opaque; 2457 int chk = s->overlap_check & ~ign; 2458 int i, j; 2459 2460 if (!size) { 2461 return 0; 2462 } 2463 2464 if (chk & QCOW2_OL_MAIN_HEADER) { 2465 if (offset < s->cluster_size) { 2466 return QCOW2_OL_MAIN_HEADER; 2467 } 2468 } 2469 2470 /* align range to test to cluster boundaries */ 2471 size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); 2472 offset = start_of_cluster(s, offset); 2473 2474 if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { 2475 if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { 2476 return QCOW2_OL_ACTIVE_L1; 2477 } 2478 } 2479 2480 if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { 2481 if (overlaps_with(s->refcount_table_offset, 2482 s->refcount_table_size * sizeof(uint64_t))) { 2483 return QCOW2_OL_REFCOUNT_TABLE; 2484 } 2485 } 2486 2487 if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { 2488 if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { 2489 return QCOW2_OL_SNAPSHOT_TABLE; 2490 } 2491 } 2492 2493 if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { 2494 for (i = 0; i < s->nb_snapshots; i++) { 2495 if (s->snapshots[i].l1_size && 2496 overlaps_with(s->snapshots[i].l1_table_offset, 2497 s->snapshots[i].l1_size * sizeof(uint64_t))) { 2498 return QCOW2_OL_INACTIVE_L1; 2499 } 2500 } 2501 } 2502 2503 if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { 2504 for (i = 0; i < s->l1_size; i++) { 2505 if ((s->l1_table[i] & L1E_OFFSET_MASK) && 2506 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, 2507 s->cluster_size)) { 2508 return QCOW2_OL_ACTIVE_L2; 2509 } 2510 } 2511 } 2512 2513 if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { 2514 unsigned last_entry = s->max_refcount_table_index; 2515 assert(last_entry < s->refcount_table_size); 2516 assert(last_entry + 1 == s->refcount_table_size || 2517 (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0); 2518 for (i = 0; i <= last_entry; i++) { 2519 if ((s->refcount_table[i] & REFT_OFFSET_MASK) && 2520 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, 2521 s->cluster_size)) { 2522 return QCOW2_OL_REFCOUNT_BLOCK; 2523 } 2524 } 2525 } 2526 2527 if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { 2528 for (i = 0; i < s->nb_snapshots; i++) { 2529 uint64_t l1_ofs = s->snapshots[i].l1_table_offset; 2530 uint32_t l1_sz = s->snapshots[i].l1_size; 2531 uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); 2532 uint64_t *l1 = g_try_malloc(l1_sz2); 2533 int ret; 2534 2535 if (l1_sz2 && l1 == NULL) { 2536 return -ENOMEM; 2537 } 2538 2539 ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); 2540 if (ret < 0) { 2541 g_free(l1); 2542
return ret; 2543 } 2544 2545 for (j = 0; j < l1_sz; j++) { 2546 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; 2547 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { 2548 g_free(l1); 2549 return QCOW2_OL_INACTIVE_L2; 2550 } 2551 } 2552 2553 g_free(l1); 2554 } 2555 } 2556 2557 return 0; 2558 } 2559 2560 static const char *metadata_ol_names[] = { 2561 [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", 2562 [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", 2563 [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", 2564 [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", 2565 [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", 2566 [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", 2567 [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", 2568 [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", 2569 }; 2570 2571 /* 2572 * First performs a check for metadata overlaps (through 2573 * qcow2_check_metadata_overlap); if that fails with a negative value (error 2574 * while performing a check), that value is returned. If an impending overlap 2575 * is detected, the BDS will be made unusable, the qcow2 file marked corrupt 2576 * and -EIO returned. 2577 * 2578 * Returns 0 if there were neither overlaps nor errors while checking for 2579 * overlaps; or a negative value (-errno) on error. 2580 */ 2581 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, 2582 int64_t size) 2583 { 2584 int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); 2585 2586 if (ret < 0) { 2587 return ret; 2588 } else if (ret > 0) { 2589 int metadata_ol_bitnr = ctz32(ret); 2590 assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); 2591 2592 qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " 2593 "write on metadata (overlaps with %s)", 2594 metadata_ol_names[metadata_ol_bitnr]); 2595 return -EIO; 2596 } 2597 2598 return 0; 2599 } 2600 2601 /* A pointer to a function of this type is given to walk_over_reftable(). That 2602 * function will create refblocks and pass them to a RefblockFinishOp once they 2603 * are completed (@refblock). @refblock_empty is set if the refblock is 2604 * completely empty. 2605 * 2606 * Along with the refblock, a corresponding reftable entry is passed, in the 2607 * reftable @reftable (which may be reallocated) at @reftable_index. 2608 * 2609 * @allocated should be set to true if a new cluster has been allocated. 2610 */ 2611 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable, 2612 uint64_t reftable_index, uint64_t *reftable_size, 2613 void *refblock, bool refblock_empty, 2614 bool *allocated, Error **errp); 2615 2616 /** 2617 * This "operation" for walk_over_reftable() allocates the refblock on disk (if 2618 * it is not empty) and inserts its offset into the new reftable. The size of 2619 * this new reftable is increased as required. 
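 *
 * Example (illustrative, assuming 64 kB clusters): the reftable buffer grows
 * in steps of cluster_size / sizeof(uint64_t) = 8192 entries, so the first
 * growth step already provides room for refblock indices 0..8191.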
2620 */ 2621 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable, 2622 uint64_t reftable_index, uint64_t *reftable_size, 2623 void *refblock, bool refblock_empty, bool *allocated, 2624 Error **errp) 2625 { 2626 BDRVQcow2State *s = bs->opaque; 2627 int64_t offset; 2628 2629 if (!refblock_empty && reftable_index >= *reftable_size) { 2630 uint64_t *new_reftable; 2631 uint64_t new_reftable_size; 2632 2633 new_reftable_size = ROUND_UP(reftable_index + 1, 2634 s->cluster_size / sizeof(uint64_t)); 2635 if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { 2636 error_setg(errp, 2637 "This operation would make the refcount table grow " 2638 "beyond the maximum size supported by QEMU, aborting"); 2639 return -ENOTSUP; 2640 } 2641 2642 new_reftable = g_try_realloc(*reftable, new_reftable_size * 2643 sizeof(uint64_t)); 2644 if (!new_reftable) { 2645 error_setg(errp, "Failed to increase reftable buffer size"); 2646 return -ENOMEM; 2647 } 2648 2649 memset(new_reftable + *reftable_size, 0, 2650 (new_reftable_size - *reftable_size) * sizeof(uint64_t)); 2651 2652 *reftable = new_reftable; 2653 *reftable_size = new_reftable_size; 2654 } 2655 2656 if (!refblock_empty && !(*reftable)[reftable_index]) { 2657 offset = qcow2_alloc_clusters(bs, s->cluster_size); 2658 if (offset < 0) { 2659 error_setg_errno(errp, -offset, "Failed to allocate refblock"); 2660 return offset; 2661 } 2662 (*reftable)[reftable_index] = offset; 2663 *allocated = true; 2664 } 2665 2666 return 0; 2667 } 2668 2669 /** 2670 * This "operation" for walk_over_reftable() writes the refblock to disk at the 2671 * offset specified by the new reftable's entry. It does not modify the new 2672 * reftable or change any refcounts. 2673 */ 2674 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable, 2675 uint64_t reftable_index, uint64_t *reftable_size, 2676 void *refblock, bool refblock_empty, bool *allocated, 2677 Error **errp) 2678 { 2679 BDRVQcow2State *s = bs->opaque; 2680 int64_t offset; 2681 int ret; 2682 2683 if (reftable_index < *reftable_size && (*reftable)[reftable_index]) { 2684 offset = (*reftable)[reftable_index]; 2685 2686 ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); 2687 if (ret < 0) { 2688 error_setg_errno(errp, -ret, "Overlap check failed"); 2689 return ret; 2690 } 2691 2692 ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size); 2693 if (ret < 0) { 2694 error_setg_errno(errp, -ret, "Failed to write refblock"); 2695 return ret; 2696 } 2697 } else { 2698 assert(refblock_empty); 2699 } 2700 2701 return 0; 2702 } 2703 2704 /** 2705 * This function walks over the existing reftable and every referenced refblock; 2706 * if @new_set_refcount is non-NULL, it is called for every refcount entry to 2707 * create an equal new entry in the passed @new_refblock. Once that 2708 * @new_refblock is completely filled, @operation will be called. 2709 * 2710 * @status_cb and @cb_opaque are used for the amend operation's status callback. 2711 * @index is the index of the walk_over_reftable() calls and @total is the total 2712 * number of walk_over_reftable() calls per amend operation. Both are used for 2713 * calculating the parameters for the status callback. 2714 * 2715 * @allocated is set to true if a new cluster has been allocated. 
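 *
 * Example (illustrative, assuming 64 kB clusters): when changing from 16-bit
 * to 8-bit refcounts, a new refblock holds 65536 entries instead of 32768,
 * so only half as many new refblocks (and reftable entries) are needed to
 * cover the same image.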
2716 */ 2717 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable, 2718 uint64_t *new_reftable_index, 2719 uint64_t *new_reftable_size, 2720 void *new_refblock, int new_refblock_size, 2721 int new_refcount_bits, 2722 RefblockFinishOp *operation, bool *allocated, 2723 Qcow2SetRefcountFunc *new_set_refcount, 2724 BlockDriverAmendStatusCB *status_cb, 2725 void *cb_opaque, int index, int total, 2726 Error **errp) 2727 { 2728 BDRVQcow2State *s = bs->opaque; 2729 uint64_t reftable_index; 2730 bool new_refblock_empty = true; 2731 int refblock_index; 2732 int new_refblock_index = 0; 2733 int ret; 2734 2735 for (reftable_index = 0; reftable_index < s->refcount_table_size; 2736 reftable_index++) 2737 { 2738 uint64_t refblock_offset = s->refcount_table[reftable_index] 2739 & REFT_OFFSET_MASK; 2740 2741 status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index, 2742 (uint64_t)total * s->refcount_table_size, cb_opaque); 2743 2744 if (refblock_offset) { 2745 void *refblock; 2746 2747 if (offset_into_cluster(s, refblock_offset)) { 2748 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" 2749 PRIx64 " unaligned (reftable index: %#" 2750 PRIx64 ")", refblock_offset, 2751 reftable_index); 2752 error_setg(errp, 2753 "Image is corrupt (unaligned refblock offset)"); 2754 return -EIO; 2755 } 2756 2757 ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset, 2758 &refblock); 2759 if (ret < 0) { 2760 error_setg_errno(errp, -ret, "Failed to retrieve refblock"); 2761 return ret; 2762 } 2763 2764 for (refblock_index = 0; refblock_index < s->refcount_block_size; 2765 refblock_index++) 2766 { 2767 uint64_t refcount; 2768 2769 if (new_refblock_index >= new_refblock_size) { 2770 /* new_refblock is now complete */ 2771 ret = operation(bs, new_reftable, *new_reftable_index, 2772 new_reftable_size, new_refblock, 2773 new_refblock_empty, allocated, errp); 2774 if (ret < 0) { 2775 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 2776 return ret; 2777 } 2778 2779 (*new_reftable_index)++; 2780 new_refblock_index = 0; 2781 new_refblock_empty = true; 2782 } 2783 2784 refcount = s->get_refcount(refblock, refblock_index); 2785 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) { 2786 uint64_t offset; 2787 2788 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 2789 2790 offset = ((reftable_index << s->refcount_block_bits) 2791 + refblock_index) << s->cluster_bits; 2792 2793 error_setg(errp, "Cannot decrease refcount entry width to " 2794 "%i bits: Cluster at offset %#" PRIx64 " has a " 2795 "refcount of %" PRIu64, new_refcount_bits, 2796 offset, refcount); 2797 return -EINVAL; 2798 } 2799 2800 if (new_set_refcount) { 2801 new_set_refcount(new_refblock, new_refblock_index++, 2802 refcount); 2803 } else { 2804 new_refblock_index++; 2805 } 2806 new_refblock_empty = new_refblock_empty && refcount == 0; 2807 } 2808 2809 qcow2_cache_put(bs, s->refcount_block_cache, &refblock); 2810 } else { 2811 /* No refblock means every refcount is 0 */ 2812 for (refblock_index = 0; refblock_index < s->refcount_block_size; 2813 refblock_index++) 2814 { 2815 if (new_refblock_index >= new_refblock_size) { 2816 /* new_refblock is now complete */ 2817 ret = operation(bs, new_reftable, *new_reftable_index, 2818 new_reftable_size, new_refblock, 2819 new_refblock_empty, allocated, errp); 2820 if (ret < 0) { 2821 return ret; 2822 } 2823 2824 (*new_reftable_index)++; 2825 new_refblock_index = 0; 2826 new_refblock_empty = true; 2827 } 2828 2829 if (new_set_refcount) { 2830 
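/* The old image has no refblock covering this cluster, so its
 * refcount is implicitly zero; store an explicit 0 in the new
 * refblock */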
new_set_refcount(new_refblock, new_refblock_index++, 0); 2831 } else { 2832 new_refblock_index++; 2833 } 2834 } 2835 } 2836 } 2837 2838 if (new_refblock_index > 0) { 2839 /* Complete the potentially existing partially filled final refblock */ 2840 if (new_set_refcount) { 2841 for (; new_refblock_index < new_refblock_size; 2842 new_refblock_index++) 2843 { 2844 new_set_refcount(new_refblock, new_refblock_index, 0); 2845 } 2846 } 2847 2848 ret = operation(bs, new_reftable, *new_reftable_index, 2849 new_reftable_size, new_refblock, new_refblock_empty, 2850 allocated, errp); 2851 if (ret < 0) { 2852 return ret; 2853 } 2854 2855 (*new_reftable_index)++; 2856 } 2857 2858 status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size, 2859 (uint64_t)total * s->refcount_table_size, cb_opaque); 2860 2861 return 0; 2862 } 2863 2864 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, 2865 BlockDriverAmendStatusCB *status_cb, 2866 void *cb_opaque, Error **errp) 2867 { 2868 BDRVQcow2State *s = bs->opaque; 2869 Qcow2GetRefcountFunc *new_get_refcount; 2870 Qcow2SetRefcountFunc *new_set_refcount; 2871 void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size); 2872 uint64_t *new_reftable = NULL, new_reftable_size = 0; 2873 uint64_t *old_reftable, old_reftable_size, old_reftable_offset; 2874 uint64_t new_reftable_index = 0; 2875 uint64_t i; 2876 int64_t new_reftable_offset = 0, allocated_reftable_size = 0; 2877 int new_refblock_size, new_refcount_bits = 1 << refcount_order; 2878 int old_refcount_order; 2879 int walk_index = 0; 2880 int ret; 2881 bool new_allocation; 2882 2883 assert(s->qcow_version >= 3); 2884 assert(refcount_order >= 0 && refcount_order <= 6); 2885 2886 /* see qcow2_open() */ 2887 new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3)); 2888 2889 new_get_refcount = get_refcount_funcs[refcount_order]; 2890 new_set_refcount = set_refcount_funcs[refcount_order]; 2891 2892 2893 do { 2894 int total_walks; 2895 2896 new_allocation = false; 2897 2898 /* We always have to do this walk plus the final one that writes the 2899 * refblocks; in addition, the loop here normally runs at least twice, 2900 * first to do the allocations and a second time to verify that 2901 * everything is correctly allocated, which makes three walks in 2902 * total */ 2903 total_walks = MAX(walk_index + 2, 3); 2904 2905 /* First, allocate the new refblocks so they are accounted for in the 2906 * refcount structures */ 2907 ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, 2908 &new_reftable_size, NULL, new_refblock_size, 2909 new_refcount_bits, &alloc_refblock, 2910 &new_allocation, NULL, status_cb, cb_opaque, 2911 walk_index++, total_walks, errp); 2912 if (ret < 0) { 2913 goto done; 2914 } 2915 2916 new_reftable_index = 0; 2917 2918 if (new_allocation) { 2919 if (new_reftable_offset) { 2920 qcow2_free_clusters(bs, new_reftable_offset, 2921 allocated_reftable_size * sizeof(uint64_t), 2922 QCOW2_DISCARD_NEVER); 2923 } 2924 2925 new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size * 2926 sizeof(uint64_t)); 2927 if (new_reftable_offset < 0) { 2928 error_setg_errno(errp, -new_reftable_offset, 2929 "Failed to allocate the new reftable"); 2930 ret = new_reftable_offset; 2931 goto done; 2932 } 2933 allocated_reftable_size = new_reftable_size; 2934 } 2935 } while (new_allocation); 2936 2937 /* Second, write the new refblocks */ 2938 ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, 2939 &new_reftable_size, new_refblock, 2940 new_refblock_size,
new_refcount_bits, 2941 &flush_refblock, &new_allocation, new_set_refcount, 2942 status_cb, cb_opaque, walk_index, walk_index + 1, 2943 errp); 2944 if (ret < 0) { 2945 goto done; 2946 } 2947 assert(!new_allocation); 2948 2949 2950 /* Write the new reftable */ 2951 ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset, 2952 new_reftable_size * sizeof(uint64_t)); 2953 if (ret < 0) { 2954 error_setg_errno(errp, -ret, "Overlap check failed"); 2955 goto done; 2956 } 2957 2958 for (i = 0; i < new_reftable_size; i++) { 2959 cpu_to_be64s(&new_reftable[i]); 2960 } 2961 2962 ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable, 2963 new_reftable_size * sizeof(uint64_t)); 2964 2965 for (i = 0; i < new_reftable_size; i++) { 2966 be64_to_cpus(&new_reftable[i]); 2967 } 2968 2969 if (ret < 0) { 2970 error_setg_errno(errp, -ret, "Failed to write the new reftable"); 2971 goto done; 2972 } 2973 2974 2975 /* Empty the refcount cache */ 2976 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 2977 if (ret < 0) { 2978 error_setg_errno(errp, -ret, "Failed to flush the refblock cache"); 2979 goto done; 2980 } 2981 2982 /* Update the image header to point to the new reftable; this only updates 2983 * the fields which are relevant to qcow2_update_header(); other fields 2984 * such as s->refcount_table or s->refcount_bits stay stale for now 2985 * (because we have to restore everything if qcow2_update_header() fails) */ 2986 old_refcount_order = s->refcount_order; 2987 old_reftable_size = s->refcount_table_size; 2988 old_reftable_offset = s->refcount_table_offset; 2989 2990 s->refcount_order = refcount_order; 2991 s->refcount_table_size = new_reftable_size; 2992 s->refcount_table_offset = new_reftable_offset; 2993 2994 ret = qcow2_update_header(bs); 2995 if (ret < 0) { 2996 s->refcount_order = old_refcount_order; 2997 s->refcount_table_size = old_reftable_size; 2998 s->refcount_table_offset = old_reftable_offset; 2999 error_setg_errno(errp, -ret, "Failed to update the qcow2 header"); 3000 goto done; 3001 } 3002 3003 /* Now update the rest of the in-memory information */ 3004 old_reftable = s->refcount_table; 3005 s->refcount_table = new_reftable; 3006 update_max_refcount_table_index(s); 3007 3008 s->refcount_bits = 1 << refcount_order; 3009 s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); 3010 s->refcount_max += s->refcount_max - 1; 3011 3012 s->refcount_block_bits = s->cluster_bits - (refcount_order - 3); 3013 s->refcount_block_size = 1 << s->refcount_block_bits; 3014 3015 s->get_refcount = new_get_refcount; 3016 s->set_refcount = new_set_refcount; 3017 3018 /* For cleaning up all old refblocks and the old reftable below the "done" 3019 * label */ 3020 new_reftable = old_reftable; 3021 new_reftable_size = old_reftable_size; 3022 new_reftable_offset = old_reftable_offset; 3023 3024 done: 3025 if (new_reftable) { 3026 /* On success, new_reftable actually points to the old reftable (and 3027 * new_reftable_size is the old reftable's size); but that is just 3028 * fine */ 3029 for (i = 0; i < new_reftable_size; i++) { 3030 uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK; 3031 if (offset) { 3032 qcow2_free_clusters(bs, offset, s->cluster_size, 3033 QCOW2_DISCARD_OTHER); 3034 } 3035 } 3036 g_free(new_reftable); 3037 3038 if (new_reftable_offset > 0) { 3039 qcow2_free_clusters(bs, new_reftable_offset, 3040 new_reftable_size * sizeof(uint64_t), 3041 QCOW2_DISCARD_OTHER); 3042 } 3043 } 3044 3045 qemu_vfree(new_refblock); 3046 return ret; 3047 } 3048
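/*
 * A minimal usage sketch for qcow2_change_refcount_order() (hypothetical
 * caller; in QEMU this is normally driven by the image amend code for
 * "qemu-img amend -o refcount_bits=..."). The no-op status callback and the
 * refcount_order of 4 (i.e. 16 refcount bits) are assumptions made purely
 * for illustration:
 *
 *     static void status_cb(BlockDriverState *bs, int64_t offset,
 *                           int64_t total_work_size, void *opaque)
 *     {
 *     }
 *
 *     ...
 *     Error *local_err = NULL;
 *     int ret = qcow2_change_refcount_order(bs, 4, status_cb, NULL,
 *                                           &local_err);
 *     if (ret < 0) {
 *         error_report_err(local_err);
 *     }
 */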