/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include <zlib.h>

#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/bswap.h"
#include "trace.h"

int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size, i, ret;

    if (exact_size >= s->l1_size) {
        return 0;
    }

    new_l1_size = exact_size;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
                                       new_l1_size * sizeof(uint64_t),
                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_flush(bs->file->bs);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
            continue;
        }
        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
        s->l1_table[i] = 0;
    }
    return 0;

fail:
    /*
     * If the write to the L1 table failed, the image may contain a partially
     * overwritten L1 table. In this case it would be better to clear the
     * L1 table in memory to avoid possible image corruption.
     */
    memset(s->l1_table + new_l1_size, 0,
           (s->l1_size - new_l1_size) * sizeof(uint64_t));
    return ret;
}

int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    /* Do a sanity check on min_size before trying to calculate new_l1_size
     * (this prevents overflows during the while loop for the calculation of
     * new_l1_size) */
    if (min_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2);
        }
    }

    QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
    if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs,
                                       align_offset(new_l1_size2, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));

    if (s->l1_size) {
        memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
    }

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
                           new_l1_table, new_l1_size2);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    stl_be_p(data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
                           data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }
    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
    return 0;
fail:
    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}

/*
 * l2_load
 *
 * Loads a L2 table into memory.
 * If the table is in the cache, the cache is used; otherwise the L2 table is
 * loaded from the image file.
 *
 * Returns 0 on success, or a negative errno value if the read from the image
 * file failed; *l2_table is only valid on success.
 */

static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
                   uint64_t **l2_table)
{
    BDRVQcow2State *s = bs->opaque;

    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                           (void **)l2_table);
}

/*
 * Writes one sector of the L1 table to the disk (can't update single entries
 * and we really don't want to do a read-modify-write)
 */
#define L1_ENTRIES_PER_SECTOR (512 / 8)
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
    int l1_start_index;
    int i, ret;

    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
         i++)
    {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
            s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file,
                           s->l1_table_offset + 8 * l1_start_index,
                           buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L1 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 */

static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_table = NULL;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
    if (l2_offset < 0) {
        ret = l2_offset;
        goto fail;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
    if (ret < 0) {
        goto fail;
    }

    l2_table = *table;

    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        uint64_t* old_table;

        /* if there was an old l2 table, read it from the disk */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
        ret = qcow2_cache_get(bs, s->l2_table_cache,
                              old_l2_offset & L1E_OFFSET_MASK,
                              (void**) &old_table);
        if (ret < 0) {
            goto fail;
        }

        memcpy(l2_table, old_table, s->cluster_size);

        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
    }

    /* write the l2 table to the file */
    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = qcow2_write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    *table = l2_table;
    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    if (l2_table != NULL) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
    }
    s->l1_table[l1_index] = old_l2_offset;
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                            QCOW2_DISCARD_ALWAYS);
    }
    return ret;
}

/*
 * Checks how many clusters in a given L2 table are contiguous in the image
 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 * to the first cluster, the search is stopped and the cluster is not counted
 * as contiguous.
 * (This allows it, for example, to stop at the first compressed cluster which
 * may require different handling)
 */
static int count_contiguous_clusters(int nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t stop_flags)
{
    int i;
    QCow2ClusterType first_cluster_type;
    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
    uint64_t first_entry = be64_to_cpu(l2_table[0]);
    uint64_t offset = first_entry & mask;

    if (!offset) {
        return 0;
    }

    /* must be allocated */
    first_cluster_type = qcow2_get_cluster_type(first_entry);
    assert(first_cluster_type == QCOW2_CLUSTER_NORMAL ||
           first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
        if (offset + (uint64_t) i * cluster_size != l2_entry) {
            break;
        }
    }

    return i;
}

/*
 * Checks how many consecutive unallocated clusters in a given L2
 * table have the same cluster type.
 */
static int count_contiguous_clusters_unallocated(int nb_clusters,
                                                 uint64_t *l2_table,
                                                 QCow2ClusterType wanted_type)
{
    int i;

    assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN ||
           wanted_type == QCOW2_CLUSTER_UNALLOCATED);
    for (i = 0; i < nb_clusters; i++) {
        uint64_t entry = be64_to_cpu(l2_table[i]);
        QCow2ClusterType type = qcow2_get_cluster_type(entry);

        if (type != wanted_type) {
            break;
        }
    }

    return i;
}

static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
                                            uint64_t src_cluster_offset,
                                            unsigned offset_in_cluster,
                                            QEMUIOVector *qiov)
{
    int ret;

    if (qiov->size == 0) {
        return 0;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    /* Call .bdrv_co_preadv() directly instead of using the public block-layer
     * interface. This avoids double I/O throttling and request tracking,
     * which can lead to deadlock when block layer copy-on-read is enabled.
     */
    ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
                                  qiov->size, qiov, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
                                                uint64_t src_cluster_offset,
                                                uint64_t cluster_offset,
                                                unsigned offset_in_cluster,
                                                uint8_t *buffer,
                                                unsigned bytes)
{
    if (bytes && bs->encrypted) {
        BDRVQcow2State *s = bs->opaque;
        int64_t offset = (s->crypt_physical_offset ?
                          (cluster_offset + offset_in_cluster) :
                          (src_cluster_offset + offset_in_cluster));
        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
        assert(s->crypto);
        if (qcrypto_block_encrypt(s->crypto, offset, buffer, bytes, NULL) < 0) {
            return false;
        }
    }
    return true;
}

static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
                                             uint64_t cluster_offset,
                                             unsigned offset_in_cluster,
                                             QEMUIOVector *qiov)
{
    int ret;

    if (qiov->size == 0) {
        return 0;
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
            cluster_offset + offset_in_cluster, qiov->size);
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
                          qiov->size, qiov, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}


/*
 * get_cluster_offset
 *
 * For a given offset of the virtual disk, find the cluster type and offset in
 * the qcow2 file. The offset is stored in *cluster_offset.
 *
 * On entry, *bytes is the maximum number of contiguous bytes starting at
 * offset that we are interested in.
 *
 * On exit, *bytes is the number of bytes starting at offset that have the same
 * cluster type and (if applicable) are stored contiguously in the image file.
 * Compressed clusters are always returned one by one.
 *
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
 */
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
                             unsigned int *bytes, uint64_t *cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_table;
    int l1_bits, c;
    unsigned int offset_in_cluster;
    uint64_t bytes_available, bytes_needed, nb_clusters;
    QCow2ClusterType type;
    int ret;

    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the start of the cluster
     * containing offset and the end of the l1 entry */
    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
                    + offset_in_cluster;

    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
    }

    *cluster_offset = 0;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size) {
        type = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        type = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* load the l2 table in memory */

    ret = l2_load(bs, l2_offset, &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_index(s, offset);
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);

    nb_clusters = size_to_clusters(s, bytes_needed);
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
     * integers; the minimum cluster size is 512, so this assertion is always
     * true */
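    /*
     * Illustrative example (not part of the original code): with 64 KiB
     * clusters, a 1 MiB request starting 4 KiB into a cluster gives
     * bytes_needed = 4096 + 1048576, so size_to_clusters() yields 17,
     * comfortably below INT_MAX, as the comment above argues.
     */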
    assert(nb_clusters <= INT_MAX);

    type = qcow2_get_cluster_type(*cluster_offset);
    if (s->qcow_version < 3 && (type == QCOW2_CLUSTER_ZERO_PLAIN ||
                                type == QCOW2_CLUSTER_ZERO_ALLOC)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                " in pre-v3 image (L2 offset: %#" PRIx64
                                ", L2 index: %#x)", l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    switch (type) {
    case QCOW2_CLUSTER_COMPRESSED:
        /* Compressed clusters can only be processed one by one */
        c = 1;
        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
    case QCOW2_CLUSTER_ZERO_PLAIN:
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
        c = count_contiguous_clusters_unallocated(nb_clusters,
                                                  &l2_table[l2_index], type);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_ZERO_ALLOC:
    case QCOW2_CLUSTER_NORMAL:
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
        if (offset_into_cluster(s, *cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "Cluster allocation offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", *cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        break;
    default:
        abort();
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);

    bytes_available = (int64_t)c * s->cluster_size;

out:
    if (bytes_available > bytes_needed) {
        bytes_available = bytes_needed;
    }

    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
     * subtracting offset_in_cluster will therefore definitely yield something
     * not exceeding UINT_MAX */
    assert(bytes_available - offset_in_cluster <= UINT_MAX);
    *bytes = bytes_available - offset_in_cluster;

    return type;

fail:
    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
    return ret;
}

/*
 * get_cluster_table
 *
 * For a given disk offset, load (and allocate if needed) the L2 table.
 *
 * The L2 table offset in the qcow2 file and the cluster index in the L2 table
 * are given to the caller.
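 *
 * Illustrative example (not from the original comment): with 64 KiB clusters
 * (cluster_bits = 16, l2_bits = 13), a guest offset of 0x5000000 yields
 * l1_index = offset >> 29 = 0 and l2_index = (offset >> 16) & 0x1fff = 0x500.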
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             int *new_l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_table = NULL;
    int ret;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* seek the l2 table of the given l2 offset */

    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
        /* load the l2 table in memory */
        ret = l2_load(bs, l2_offset, &l2_table);
        if (ret < 0) {
            return ret;
        }
    } else {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index, &l2_table);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_index(s, offset);

    *new_l2_table = l2_table;
    *new_l2_index = l2_index;

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * If the offset is not found, allocate a new compressed cluster.
 *
 * Return the cluster offset if successful, or 0 otherwise.
 */

uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                               uint64_t offset,
                                               int compressed_size)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_table;
    int64_t cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return 0;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated.
     */
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return cluster_offset;
}

static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2COWRegion *start = &m->cow_start;
    Qcow2COWRegion *end = &m->cow_end;
    unsigned buffer_size;
    unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
    bool merge_reads;
    uint8_t *start_buffer, *end_buffer;
    QEMUIOVector qiov;
    int ret;

    assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
    assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
    assert(start->offset + start->nb_bytes <= end->offset);
    assert(!m->data_qiov || m->data_qiov->size == data_bytes);

    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
        return 0;
    }

    /* If we have to read both the start and end COW regions and the
     * middle region is not too large then perform just one read
     * operation */
    merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
    if (merge_reads) {
        buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
    } else {
        /* If we have to do two reads, add some padding in the middle
         * if necessary to make sure that the end region is optimally
         * aligned. */
        size_t align = bdrv_opt_mem_align(bs);
        assert(align > 0 && align <= UINT_MAX);
        assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
               UINT_MAX - end->nb_bytes);
        buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
    }

    /* Reserve a buffer large enough to store all the data that we're
     * going to read */
    start_buffer = qemu_try_blockalign(bs, buffer_size);
    if (start_buffer == NULL) {
        return -ENOMEM;
    }
    /* The part of the buffer where the end region is located */
    end_buffer = start_buffer + buffer_size - end->nb_bytes;

    qemu_iovec_init(&qiov, 2 + (m->data_qiov ? m->data_qiov->niov : 0));

    qemu_co_mutex_unlock(&s->lock);
    /* First we read the existing data from both COW regions. We
     * either read the whole region in one go, or the start and end
     * regions separately.
     */
    if (merge_reads) {
        qemu_iovec_add(&qiov, start_buffer, buffer_size);
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
    } else {
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
    }
    if (ret < 0) {
        goto fail;
    }

    /* Encrypt the data if necessary before writing it */
    if (bs->encrypted) {
        if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
                                    start->offset, start_buffer,
                                    start->nb_bytes) ||
            !do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
                                    end->offset, end_buffer, end->nb_bytes)) {
            ret = -EIO;
            goto fail;
        }
    }

    /* And now we can write everything. If we have the guest data we
     * can write everything in one single operation */
    if (m->data_qiov) {
        qemu_iovec_reset(&qiov);
        if (start->nb_bytes) {
            qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        }
        qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes);
        if (end->nb_bytes) {
            qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        }
        /* NOTE: we have a write_aio blkdebug event here followed by
         * a cow_write one in do_perform_cow_write(), but there's only
         * one single I/O operation */
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
    } else {
        /* If there's no guest data then write both COW regions separately */
        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
    }

fail:
    qemu_co_mutex_lock(&s->lock);

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    if (ret == 0) {
        qcow2_cache_depends_on_flush(s->l2_table_cache);
    }

    qemu_vfree(start_buffer);
    qemu_iovec_destroy(&qiov);
    return ret;
}

int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_table;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    old_cluster = g_try_new(uint64_t, m->nb_clusters);
    if (old_cluster == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);

    assert(l2_index + m->nb_clusters <= s->l2_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* If two concurrent writes happen to the same unallocated cluster,
         * each write allocates a separate cluster and writes data
         * concurrently. The first one to complete updates the L2 table with a
         * pointer to its cluster; the second one has to do RMW (which is done
         * above by perform_cow()), update the L2 table with its cluster
         * pointer, and free the old cluster. This is what this loop does. */
        if (l2_table[l2_index + i] != 0) {
            old_cluster[j++] = l2_table[l2_index + i];
        }

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
    }


    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     *
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
     * clusters), the next write will reuse them anyway.
     */
    if (!m->keep_old_clusters && j != 0) {
        for (i = 0; i < j; i++) {
            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
                                    QCOW2_DISCARD_NEVER);
        }
    }

    ret = 0;
err:
    g_free(old_cluster);
    return ret;
}

/*
 * Returns the number of contiguous clusters that can be used for an allocating
 * write, but require COW to be performed (this includes not yet allocated
 * space, which must be copied from the backing file)
 */
static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
    uint64_t *l2_table, int l2_index)
{
    int i;

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
        QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

        switch(cluster_type) {
        case QCOW2_CLUSTER_NORMAL:
            if (l2_entry & QCOW_OFLAG_COPIED) {
                goto out;
            }
            break;
        case QCOW2_CLUSTER_UNALLOCATED:
        case QCOW2_CLUSTER_COMPRESSED:
        case QCOW2_CLUSTER_ZERO_PLAIN:
        case QCOW2_CLUSTER_ZERO_ALLOC:
            break;
        default:
            abort();
        }
    }

out:
    assert(i <= nb_clusters);
    return i;
}

/*
 * Check if there already is an AIO write request in flight which allocates
 * the same cluster. In this case we need to wait until the previous
 * request has completed and updated the L2 table accordingly.
 *
 * Returns:
 *   0       if there was no dependency. *cur_bytes indicates the number of
 *           bytes from guest_offset that can be read before the next
 *           dependency must be processed (or the request is complete)
 *
 *   -EAGAIN if we had to wait for another request, previously gathered
 *           information on cluster allocation may be invalid now. The caller
 *           must start over anyway, so consider *cur_bytes undefined.
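 *
 * Illustrative example (not from the original comment): if an in-flight
 * allocation covers guest offsets [128 KiB, 256 KiB) and a new request
 * starts at 64 KiB with *cur_bytes = 512 KiB, the request is shortened to
 * 64 KiB so that it ends where the running allocation begins.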
 */
static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *cur_bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    QCowL2Meta *old_alloc;
    uint64_t bytes = *cur_bytes;

    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t start = guest_offset;
        uint64_t end = start + bytes;
        uint64_t old_start = l2meta_cow_start(old_alloc);
        uint64_t old_end = l2meta_cow_end(old_alloc);

        if (end <= old_start || start >= old_end) {
            /* No intersection */
        } else {
            if (start < old_start) {
                /* Stop at the start of a running allocation */
                bytes = old_start - start;
            } else {
                bytes = 0;
            }

            /* Stop if an l2meta already exists. After yielding, it wouldn't
             * be valid any more, so we'd have to clean up the old L2Metas
             * and deal with requests depending on them before starting to
             * gather new ones. Not worth the trouble. */
            if (bytes == 0 && *m) {
                *cur_bytes = 0;
                return 0;
            }

            if (bytes == 0) {
                /* Wait for the dependency to complete. We need to recheck
                 * the free/allocated clusters when we continue. */
                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                return -EAGAIN;
            }
        }
    }

    /* Make sure that existing clusters and new allocations are only used up to
     * the next dependency if we shortened the request above */
    *cur_bytes = bytes;

    return 0;
}

/*
 * Checks how many already allocated clusters that don't require a copy on
 * write there are at the given guest_offset (up to *bytes). If
 * *host_offset is not zero, only physically contiguous clusters beginning at
 * this host offset are counted.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no allocated clusters are available at the given offset.
 *          *bytes is normally unchanged. It is set to 0 if the cluster
 *          is allocated and doesn't need COW, but doesn't have the right
 *          physical offset.
 *
 *   1:     if allocated clusters that don't require a COW are available at
 *          the requested offset. *bytes may have decreased and describes
 *          the length of the area that can be written to.
 *
 *  -errno: in error cases
 */
static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t cluster_offset;
    uint64_t *l2_table;
    uint64_t nb_clusters;
    unsigned int keep_clusters;
    int ret;

    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);

    assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
                                == offset_into_cluster(s, *host_offset));

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* Check how many clusters are already allocated and don't need COW */
    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
        && (cluster_offset & QCOW_OFLAG_COPIED))
    {
        /* If a specific host_offset is required, check it */
        bool offset_matches =
            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;

        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
                                    "%#llx unaligned (guest offset: %#" PRIx64
                                    ")", cluster_offset & L2E_OFFSET_MASK,
                                    guest_offset);
            ret = -EIO;
            goto out;
        }

        if (*host_offset != 0 && !offset_matches) {
            *bytes = 0;
            ret = 0;
            goto out;
        }

        /* We keep all QCOW_OFLAG_COPIED clusters */
        keep_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index],
                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
        assert(keep_clusters <= nb_clusters);

        *bytes = MIN(*bytes,
                 keep_clusters * s->cluster_size
                 - offset_into_cluster(s, guest_offset));

        ret = 1;
    } else {
        ret = 0;
    }

    /* Cleanup */
out:
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
    if (ret > 0) {
        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }

    return ret;
}

/*
 * Allocates new clusters for the given guest_offset.
 *
 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
 * contain the number of clusters that have been allocated and are contiguous
 * in the image file.
 *
 * If *host_offset is non-zero, it specifies the offset in the image file at
 * which the new clusters must start. *nb_clusters can be 0 on return in this
 * case if the cluster at host_offset is already in use. If *host_offset is
 * zero, the clusters can be allocated anywhere in the image file.
 *
 * *host_offset is updated to contain the offset into the image file at which
 * the first allocated cluster starts.
 *
 * Return 0 on success and -errno in error cases. -EAGAIN means that the
 * function has been waiting for another request and the allocation must be
 * restarted, but the whole request should not be failed.
 */
static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
                                   uint64_t *host_offset, uint64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;

    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                         *host_offset, *nb_clusters);

    /* Allocate new clusters */
    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
    if (*host_offset == 0) {
        int64_t cluster_offset =
            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
        if (cluster_offset < 0) {
            return cluster_offset;
        }
        *host_offset = cluster_offset;
        return 0;
    } else {
        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (ret < 0) {
            return ret;
        }
        *nb_clusters = ret;
        return 0;
    }
}

/*
 * Allocates new clusters for an area that either is yet unallocated or needs a
 * copy on write. If *host_offset is non-zero, clusters are only allocated if
 * the new allocation can match the specified host offset.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no clusters could be allocated. *bytes is set to 0,
 *          *host_offset is left unchanged.
 *
 *   1:     if new clusters were allocated. *bytes may be decreased if the
 *          new allocation doesn't cover all of the requested area.
 *          *host_offset is updated to contain the host offset of the first
 *          newly allocated cluster.
 *
 *  -errno: in error cases
 */
static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t *l2_table;
    uint64_t entry;
    uint64_t nb_clusters;
    int ret;
    bool keep_old_clusters = false;

    uint64_t alloc_cluster_offset = 0;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    entry = be64_to_cpu(l2_table[l2_index]);

    /* For the moment, overwrite compressed clusters one by one */
    if (entry & QCOW_OFLAG_COMPRESSED) {
        nb_clusters = 1;
    } else {
        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
    }

    /* This function is only called when there were no non-COW clusters, so if
     * we can't find any unallocated or COW clusters either, something is
     * wrong with our code.
     */
    assert(nb_clusters > 0);

    if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO_ALLOC &&
        (entry & QCOW_OFLAG_COPIED) &&
        (!*host_offset ||
         start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK)))
    {
        /* Try to reuse preallocated zero clusters; contiguous normal clusters
         * would be fine, too, but count_cow_clusters() above has limited
         * nb_clusters already to a range of COW clusters */
        int preallocated_nb_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index], QCOW_OFLAG_COPIED);
        assert(preallocated_nb_clusters > 0);

        nb_clusters = preallocated_nb_clusters;
        alloc_cluster_offset = entry & L2E_OFFSET_MASK;

        /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2()
         * should not free them. */
        keep_old_clusters = true;
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    if (!alloc_cluster_offset) {
        /* Allocate, if necessary at a given offset in the image file */
        alloc_cluster_offset = start_of_cluster(s, *host_offset);
        ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                      &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        /* Can't extend contiguous allocation */
        if (nb_clusters == 0) {
            *bytes = 0;
            return 0;
        }

        /* !*host_offset would overwrite the image header and is reserved for
         * "no host offset preferred". If 0 was a valid host offset, it'd
         * trigger the following overlap check; do that now to avoid having an
         * invalid value in *host_offset. */
        if (!alloc_cluster_offset) {
            ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
                                                nb_clusters * s->cluster_size);
            assert(ret < 0);
            goto fail;
        }
    }

    /*
     * Save info needed for meta data update.
     *
     * requested_bytes: Number of bytes from the start of the first
     * newly allocated cluster to the end of the (possibly shortened
     * before) write request.
     *
     * avail_bytes: Number of bytes from the start of the first
     * newly allocated to the end of the last newly allocated cluster.
     *
     * nb_bytes: The number of bytes from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
    int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits);
    int nb_bytes = MIN(requested_bytes, avail_bytes);
    QCowL2Meta *old_m = *m;

    *m = g_malloc0(sizeof(**m));

    **m = (QCowL2Meta) {
        .next           = old_m,

        .alloc_offset   = alloc_cluster_offset,
        .offset         = start_of_cluster(s, guest_offset),
        .nb_clusters    = nb_clusters,

        .keep_old_clusters = keep_old_clusters,

        .cow_start = {
            .offset     = 0,
            .nb_bytes   = offset_into_cluster(s, guest_offset),
        },
        .cow_end = {
            .offset     = nb_bytes,
            .nb_bytes   = avail_bytes - nb_bytes,
        },
    };
    qemu_co_queue_init(&(*m)->dependent_requests);
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    return 1;

fail:
    if (*m && (*m)->nb_clusters > 0) {
        QLIST_REMOVE(*m, next_in_flight);
    }
    return ret;
}

/*
 * alloc_cluster_offset
 *
 * For a given offset on the virtual disk, find the cluster offset in the
 * qcow2 file. If the offset is not found, allocate a new cluster.
 *
 * If the cluster was already allocated, m->nb_clusters is set to 0 and
 * other fields in m are meaningless.
 *
 * If the cluster is newly allocated, m->nb_clusters is set to the number of
 * contiguous clusters that have been allocated. In this case, the other
 * fields of m are valid and contain information about the first allocated
 * cluster.
 *
 * If the request conflicts with another write request in flight, the coroutine
 * is queued and will be reentered when the dependency has completed.
 *
 * Return 0 on success and -errno in error cases
 */
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                               unsigned int *bytes, uint64_t *host_offset,
                               QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, remaining;
    uint64_t cluster_offset;
    uint64_t cur_bytes;
    int ret;

    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);

again:
    start = offset;
    remaining = *bytes;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
    *m = NULL;

    while (true) {

        if (!*host_offset) {
            *host_offset = start_of_cluster(s, cluster_offset);
        }

        assert(remaining >= cur_bytes);

        start += cur_bytes;
        remaining -= cur_bytes;
        cluster_offset += cur_bytes;

        if (remaining == 0) {
            break;
        }

        cur_bytes = remaining;

        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *      a) Overlap not in the first cluster -> shorten this request and
         *         let the caller handle the rest in its next loop iteration.
         *
         *      b) Real overlaps of two requests.
         *         Yield and restart the search for contiguous clusters (the
         *         situation could have changed while we were sleeping)
         *
         *      c) TODO: Request starts in the same cluster as the in-flight
         *         allocation ends. Shorten the COW of the in-flight
         *         allocation, set cluster_offset to write to the same cluster
         *         and set up the right synchronisation between the in-flight
         *         request and the new one.
         */
        ret = handle_dependencies(bs, start, &cur_bytes, m);
        if (ret == -EAGAIN) {
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
            goto again;
        } else if (ret < 0) {
            return ret;
        } else if (cur_bytes == 0) {
            break;
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
        }

        /*
         * 2. Count contiguous COPIED clusters.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else if (cur_bytes == 0) {
            break;
        }

        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else {
            assert(cur_bytes == 0);
            break;
        }
    }

    *bytes -= remaining;
    assert(*bytes > 0);
    assert(*host_offset != 0);

    return 0;
}

static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}

int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;

        /* Allocate buffers on the first decompress operation; most images are
         * uncompressed and the memory overhead can be avoided. The buffers
         * are freed in .bdrv_close().
         */
        if (!s->cluster_data) {
            /* one more sector for decompressed data alignment */
            s->cluster_data = qemu_try_blockalign(bs->file->bs,
                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + 512);
            if (!s->cluster_data) {
                return -ENOMEM;
            }
        }
        if (!s->cluster_cache) {
            s->cluster_cache = g_malloc(s->cluster_size);
        }

        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data,
                        nb_csectors);
        if (ret < 0) {
            return ret;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -EIO;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}

/*
 * This discards as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of discarded
 * clusters.
 */
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
                             uint64_t nb_clusters, enum qcow2_discard_type type,
                             bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry;

        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);

        /*
         * If full_discard is false, make sure that a discarded area reads back
         * as zeroes for v3 images (we cannot do it for v2 without actually
         * writing a zero-filled buffer). We can skip the operation if the
         * cluster is already marked as zero, or if it's unallocated and we
         * don't have a backing file.
         *
         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
         *
         * If full_discard is true, the sector should not read back as zeroes,
         * but rather fall through to the backing file.
         */
        switch (qcow2_get_cluster_type(old_l2_entry)) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (full_discard || !bs->backing) {
                continue;
            }
            break;

        case QCOW2_CLUSTER_ZERO_PLAIN:
            if (!full_discard) {
                continue;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        case QCOW2_CLUSTER_COMPRESSED:
            break;

        default:
            abort();
        }

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (!full_discard && s->qcow_version >= 3) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
            l2_table[l2_index + i] = cpu_to_be64(0);
        }

        /* Then decrease the refcount */
        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
                          uint64_t bytes, enum qcow2_discard_type type,
                          bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset = offset + bytes;
    uint64_t nb_clusters;
    int64_t cleared;
    int ret;

    /* Caller must pass aligned values, except at image end */
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);

    nb_clusters = size_to_clusters(s, bytes);

    s->cache_discards = true;

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
        cleared = discard_single_l2(bs, offset, nb_clusters, type,
                                    full_discard);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
        }

        nb_clusters -= cleared;
        offset += (cleared * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of zeroed
 * clusters.
 */
static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
                          uint64_t nb_clusters, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;
    bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_offset;
        QCow2ClusterType cluster_type;

        old_offset = be64_to_cpu(l2_table[l2_index + i]);

        /*
         * Minimize L2 changes if the cluster already reads back as
         * zeroes with correct allocation.
         */
        cluster_type = qcow2_get_cluster_type(old_offset);
        if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN ||
            (cluster_type == QCOW2_CLUSTER_ZERO_ALLOC && !unmap)) {
            continue;
        }

        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
        } else {
            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
        }
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
                          uint64_t bytes, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset = offset + bytes;
    uint64_t nb_clusters;
    int64_t cleared;
    int ret;

    /* Caller must pass aligned values, except at image end */
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);

    /* The zero flag is only supported by version 3 and newer */
    if (s->qcow_version < 3) {
        return -ENOTSUP;
    }

    /* Each L2 table is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, bytes);

    s->cache_discards = true;

    while (nb_clusters > 0) {
        cleared = zero_single_l2(bs, offset, nb_clusters, flags);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
        }

        nb_clusters -= cleared;
        offset += (cleared * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * l1_entries and *visited_l1_entries are used to keep track of progress for
 * status_cb(). l1_entries contains the total number of L1 entries and
 * *visited_l1_entries counts all visited L1 entries.
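 *
 * Illustrative example (not from the original comment): for an image whose
 * active L1 table has 32 entries and which has two snapshots with 32-entry
 * L1 tables each, qcow2_expand_zero_clusters() passes l1_entries = 96, so
 * progress is reported against all three tables.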
 */
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                                      int l1_size, int64_t *visited_l1_entries,
                                      int64_t l1_entries,
                                      BlockDriverAmendStatusCB *status_cb,
                                      void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_table = NULL;
    int ret;
    int i, j;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
        if (l2_table == NULL) {
            return -ENOMEM;
        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;
        uint64_t l2_refcount;

        if (!l2_offset) {
            /* unallocated */
            (*visited_l1_entries)++;
            if (status_cb) {
                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
            }
            continue;
        }

        if (offset_into_cluster(s, l2_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                    PRIx64 " unaligned (L1 index: %#x)",
                                    l2_offset, i);
            ret = -EIO;
            goto fail;
        }

        if (is_active_l1) {
            /* get active L2 tables from cache */
            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                                  (void **)&l2_table);
        } else {
            /* load inactive L2 tables from disk */
            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                            (void *)l2_table, s->cluster_sectors);
        }
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            int64_t offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);

            if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
                cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
                continue;
            }

            if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                if (!bs->backing) {
                    /* not backed; therefore we can simply deallocate the
                     * cluster */
                    l2_table[j] = 0;
                    l2_dirty = true;
                    continue;
                }

                offset = qcow2_alloc_clusters(bs, s->cluster_size);
                if (offset < 0) {
                    ret = offset;
                    goto fail;
                }

                if (l2_refcount > 1) {
                    /* For shared L2 tables, set the refcount accordingly (it is
                     * already 1 and needs to be l2_refcount) */
                    ret = qcow2_update_cluster_refcount(bs,
                            offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
                            QCOW2_DISCARD_OTHER);
                    if (ret < 0) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_OTHER);
                        goto fail;
                    }
                }
            }

            if (offset_into_cluster(s, offset)) {
                qcow2_signal_corruption(bs, true, -1, -1,
                                        "Cluster allocation offset "
                                        "%#" PRIx64 " unaligned (L2 offset: %#"
                                        PRIx64 ", L2 index: %#x)", offset,
                                        l2_offset, j);
                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
            if (ret < 0) {
                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            ret =
                bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
            if (ret < 0) {
                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            if (l2_refcount == 1) {
                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
            } else {
                l2_table[j] = cpu_to_be64(offset);
            }
            l2_dirty = true;
        }

        if (is_active_l1) {
            if (l2_dirty) {
                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
                qcow2_cache_depends_on_flush(s->l2_table_cache);
            }
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
        } else {
            if (l2_dirty) {
                ret = qcow2_pre_write_overlap_check(bs,
                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
                        s->cluster_size);
                if (ret < 0) {
                    goto fail;
                }

                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                                 (void *)l2_table, s->cluster_sectors);
                if (ret < 0) {
                    goto fail;
                }
            }
        }

        (*visited_l1_entries)++;
        if (status_cb) {
            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
        }
    }

    ret = 0;

fail:
    if (l2_table) {
        if (!is_active_l1) {
            qemu_vfree(l2_table);
        } else {
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
        }
    }
    return ret;
}

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
int qcow2_expand_zero_clusters(BlockDriverState *bs,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL;
    int64_t l1_entries = 0, visited_l1_entries = 0;
    int ret;
    int i, j;

    if (status_cb) {
        l1_entries = s->l1_size;
        for (i = 0; i < s->nb_snapshots; i++) {
            l1_entries += s->snapshots[i].l1_size;
        }
    }

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     &visited_l1_entries, l1_entries,
                                     status_cb, cb_opaque);
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us.
     */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
                                      sizeof(uint64_t), BDRV_SECTOR_SIZE);

        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

        ret = bdrv_read(bs->file,
                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
                        (void *)l1_table, l1_sectors);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         &visited_l1_entries, l1_entries,
                                         status_cb, cb_opaque);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

fail:
    g_free(l1_table);
    return ret;
}