/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include <zlib.h>

#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/bswap.h"
#include "trace.h"

int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    /* Do a sanity check on min_size before trying to calculate new_l1_size
     * (this prevents overflows during the while loop for the calculation of
     * new_l1_size) */
    if (min_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = (new_l1_size * 3 + 1) / 2;
        }
    }

    if (new_l1_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs,
                                       align_offset(new_l1_size2, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));

    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2);
    if (ret < 0) {
        goto fail;
    }

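    /* The new table is byte-swapped to big-endian in place for the write
     * below and swapped back right afterwards, so the in-memory copy that
     * becomes s->l1_table further down stays in host byte order. */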
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset,
                           new_l1_table, new_l1_size2);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size),
                           data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }
    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
    return 0;
fail:
    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}

/*
 * l2_load
 *
 * Loads an L2 table into memory. If the table is in the cache, the cache
 * is used; otherwise the L2 table is loaded from the image file.
 *
 * Returns 0 on success and stores a pointer to the L2 table in *l2_table;
 * returns a negative errno if reading the table from the image file failed.
 */

static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
                   uint64_t **l2_table)
{
    BDRVQcow2State *s = bs->opaque;

    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                           (void **)l2_table);
}

/*
 * Writes one sector of the L1 table to the disk (can't update single entries
 * and we really don't want bdrv_pread to perform a read-modify-write)
 */
#define L1_ENTRIES_PER_SECTOR (512 / 8)
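/* Example: with 8-byte entries this is 64 entries per 512-byte sector, so
 * updating l1_index 70 rewrites the sector holding entries 64..127 at byte
 * offset l1_table_offset + 512 (entries beyond l1_size are written as 0). */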
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 };
    int l1_start_index;
    int i, ret;

    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
    for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size;
         i++)
    {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
            s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file->bs,
                           s->l1_table_offset + 8 * l1_start_index,
                           buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 */

static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_table = NULL;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
    if (l2_offset < 0) {
        ret = l2_offset;
        goto fail;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
    if (ret < 0) {
        goto fail;
    }

    l2_table = *table;

    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        uint64_t* old_table;

        /* if there was an old l2 table, read it from the disk */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
        ret = qcow2_cache_get(bs, s->l2_table_cache,
                              old_l2_offset & L1E_OFFSET_MASK,
                              (void**) &old_table);
        if (ret < 0) {
            goto fail;
        }

        memcpy(l2_table, old_table, s->cluster_size);

        qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table);
    }

    /* write the l2 table to the file */
    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = qcow2_write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    *table = l2_table;
    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    if (l2_table != NULL) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
    }
    s->l1_table[l1_index] = old_l2_offset;
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                            QCOW2_DISCARD_ALWAYS);
    }
    return ret;
}

/*
 * Checks how many clusters in a given L2 table are contiguous in the image
 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 * to the first cluster, the search is stopped and the cluster is not counted
 * as contiguous. (This allows it, for example, to stop at the first compressed
 * cluster which may require a different handling)
 */
static int count_contiguous_clusters(int nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t stop_flags)
{
    int i;
    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
    uint64_t first_entry = be64_to_cpu(l2_table[0]);
    uint64_t offset = first_entry & mask;

    if (!offset)
        return 0;

    assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
        if (offset + (uint64_t) i * cluster_size != l2_entry) {
            break;
        }
    }

    return i;
}

static int count_contiguous_clusters_by_type(int nb_clusters,
                                             uint64_t *l2_table,
                                             int wanted_type)
{
    int i;

    for (i = 0; i < nb_clusters; i++) {
        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));

        if (type != wanted_type) {
            break;
        }
    }

    return i;
}

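/* The IV for each 512-byte sector is simply the sector index, stored
 * little-endian in the low 64 bits of the 16-byte IV block; the upper
 * 64 bits are zero. */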
/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
                          uint8_t *out_buf, const uint8_t *in_buf,
                          int nb_sectors, bool enc,
                          Error **errp)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;
    int ret;

    for(i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        if (qcrypto_cipher_setiv(s->cipher,
                                 ivec.b, G_N_ELEMENTS(ivec.b),
                                 errp) < 0) {
            return -1;
        }
        if (enc) {
            ret = qcrypto_cipher_encrypt(s->cipher,
                                         in_buf,
                                         out_buf,
                                         512,
                                         errp);
        } else {
            ret = qcrypto_cipher_decrypt(s->cipher,
                                         in_buf,
                                         out_buf,
                                         512,
                                         errp);
        }
        if (ret < 0) {
            return -1;
        }
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
    return 0;
}

static int coroutine_fn do_perform_cow(BlockDriverState *bs,
                                       uint64_t src_cluster_offset,
                                       uint64_t cluster_offset,
                                       int offset_in_cluster,
                                       int bytes)
{
    BDRVQcow2State *s = bs->opaque;
    QEMUIOVector qiov;
    struct iovec iov;
    int ret;

    iov.iov_len = bytes;
    iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
    if (iov.iov_base == NULL) {
        return -ENOMEM;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);

    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);

    if (!bs->drv) {
        ret = -ENOMEDIUM;
        goto out;
    }

    /* Call .bdrv_co_preadv() directly instead of using the public block-layer
     * interface. This avoids double I/O throttling and request tracking,
     * which can lead to deadlock when block layer copy-on-read is enabled.
     */
    ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
                                  bytes, &qiov, 0);
    if (ret < 0) {
        goto out;
    }

    if (bs->encrypted) {
        Error *err = NULL;
        int64_t sector = (cluster_offset + offset_in_cluster)
                         >> BDRV_SECTOR_BITS;
        assert(s->cipher);
        assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
        assert((bytes & ~BDRV_SECTOR_MASK) == 0);
        if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
                                  bytes >> BDRV_SECTOR_BITS, true, &err) < 0) {
            ret = -EIO;
            error_free(err);
            goto out;
        }
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
            cluster_offset + offset_in_cluster, bytes);
    if (ret < 0) {
        goto out;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_pwritev(bs->file->bs, cluster_offset + offset_in_cluster,
                          bytes, &qiov, 0);
    if (ret < 0) {
        goto out;
    }

    ret = 0;
out:
    qemu_vfree(iov.iov_base);
    return ret;
}


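/* Example of the virtual-address split used by the lookup functions below:
 * with the default 64 KiB clusters, cluster_bits is 16 and an L2 table holds
 * 8192 entries (l2_bits = 13), so each L1 entry covers 2^29 bytes = 512 MiB
 * of virtual disk. */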
/*
 * get_cluster_offset
 *
 * For a given offset of the virtual disk, find the cluster type and offset in
 * the qcow2 file. The offset is stored in *cluster_offset.
 *
 * On entry, *bytes is the maximum number of contiguous bytes starting at
 * offset that we are interested in.
 *
 * On exit, *bytes is the number of bytes starting at offset that have the same
 * cluster type and (if applicable) are stored contiguously in the image file.
 * Compressed clusters are always returned one by one.
 *
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
 */
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
                             unsigned int *bytes, uint64_t *cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_table;
    int l1_bits, c;
    unsigned int offset_in_cluster, nb_clusters;
    uint64_t bytes_available, bytes_needed;
    int ret;

    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the start of the cluster
     * containing offset and the end of the l1 entry */
    bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
                    + offset_in_cluster;

    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
    }
    assert(bytes_needed <= INT_MAX);

    *cluster_offset = 0;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size) {
        ret = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        ret = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* load the l2 table in memory */

    ret = l2_load(bs, l2_offset, &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* bytes_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
    nb_clusters = size_to_clusters(s, bytes_needed);

    ret = qcow2_get_cluster_type(*cluster_offset);
    switch (ret) {
    case QCOW2_CLUSTER_COMPRESSED:
        /* Compressed clusters can only be processed one by one */
        c = 1;
        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
    case QCOW2_CLUSTER_ZERO:
        if (s->qcow_version < 3) {
            qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                    " in pre-v3 image (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
                                              QCOW2_CLUSTER_ZERO);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
                                              QCOW2_CLUSTER_UNALLOCATED);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_NORMAL:
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
        if (offset_into_cluster(s, *cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", *cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        break;
    default:
        abort();
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);

    bytes_available = (c * s->cluster_size);

out:
    if (bytes_available > bytes_needed) {
        bytes_available = bytes_needed;
    }

    *bytes = bytes_available - offset_in_cluster;

    return ret;

fail:
    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
    return ret;
}

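/* Worked example of the index arithmetic used by qcow2_get_cluster_offset()
 * above and get_cluster_table() below: with 64 KiB clusters, guest offset
 * 0x23450000 gives l1_index = offset >> 29 = 1,
 * l2_index = (offset >> 16) & 0x1fff = 0x345 and offset_in_cluster = 0. */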
/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the l2 table.
 *
 * the l2 table offset in the qcow2 file and the cluster index
 * in the l2 table are given to the caller.
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             int *new_l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_table = NULL;
    int ret;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* seek the l2 table of the given l2 offset */

    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
        /* load the l2 table in memory */
        ret = l2_load(bs, l2_offset, &l2_table);
        if (ret < 0) {
            return ret;
        }
    } else {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index, &l2_table);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_index = l2_index;

    return 0;
}

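/* A compressed cluster descriptor stores the host byte offset in its low bits
 * and the number of additional 512-byte sectors covered by the compressed
 * data in the bits starting at csize_shift; see the nb_csectors calculation
 * below and its counterpart in qcow2_decompress_cluster(). */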
/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * If the offset is not found, allocate a new compressed cluster.
 *
 * Return the cluster offset if successful,
 * Return 0, otherwise.
 *
 */

uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                               uint64_t offset,
                                               int compressed_size)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_table;
    int64_t cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return 0;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated. */
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return cluster_offset;
}

static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    if (r->nb_bytes == 0) {
        return 0;
    }

    qemu_co_mutex_unlock(&s->lock);
    ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, r->nb_bytes);
    qemu_co_mutex_lock(&s->lock);

    if (ret < 0) {
        return ret;
    }

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    qcow2_cache_depends_on_flush(s->l2_table_cache);

    return 0;
}

int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_table;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    old_cluster = g_try_new(uint64_t, m->nb_clusters);
    if (old_cluster == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m, &m->cow_start);
    if (ret < 0) {
        goto err;
    }

    ret = perform_cow(bs, m, &m->cow_end);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);

    assert(l2_index + m->nb_clusters <= s->l2_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* if two concurrent writes happen to the same unallocated cluster
         * each write allocates separate cluster and writes data concurrently.
         * The first one to complete updates l2 table with pointer to its
         * cluster the second one has to do RMW (which is done above by
         * perform_cow()), update l2 table with its cluster pointer and free
         * old cluster. This is what this loop does */
        if (l2_table[l2_index + i] != 0) {
            old_cluster[j++] = l2_table[l2_index + i];
        }

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
    }


    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     *
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
     * clusters), the next write will reuse them anyway.
     */
    if (j != 0) {
        for (i = 0; i < j; i++) {
            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
                                    QCOW2_DISCARD_NEVER);
        }
    }

    ret = 0;
err:
    g_free(old_cluster);
    return ret;
}

/*
 * Returns the number of contiguous clusters that can be used for an allocating
 * write, but require COW to be performed (this includes yet unallocated space,
 * which must copy from the backing file)
 */
static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
    uint64_t *l2_table, int l2_index)
{
    int i;

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
        int cluster_type = qcow2_get_cluster_type(l2_entry);

        switch(cluster_type) {
        case QCOW2_CLUSTER_NORMAL:
            if (l2_entry & QCOW_OFLAG_COPIED) {
                goto out;
            }
            break;
        case QCOW2_CLUSTER_UNALLOCATED:
        case QCOW2_CLUSTER_COMPRESSED:
        case QCOW2_CLUSTER_ZERO:
            break;
        default:
            abort();
        }
    }

out:
    assert(i <= nb_clusters);
    return i;
}

/*
 * Check if there already is an AIO write request in flight which allocates
 * the same cluster. In this case we need to wait until the previous
 * request has completed and updated the L2 table accordingly.
 *
 * Returns:
 *   0       if there was no dependency. *cur_bytes indicates the number of
 *           bytes from guest_offset that can be read before the next
 *           dependency must be processed (or the request is complete)
 *
 *   -EAGAIN if we had to wait for another request, previously gathered
 *           information on cluster allocation may be invalid now. The caller
 *           must start over anyway, so consider *cur_bytes undefined.
 */
static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *cur_bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    QCowL2Meta *old_alloc;
    uint64_t bytes = *cur_bytes;

    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t start = guest_offset;
        uint64_t end = start + bytes;
        uint64_t old_start = l2meta_cow_start(old_alloc);
        uint64_t old_end = l2meta_cow_end(old_alloc);

        if (end <= old_start || start >= old_end) {
            /* No intersection */
        } else {
            if (start < old_start) {
                /* Stop at the start of a running allocation */
                bytes = old_start - start;
            } else {
                bytes = 0;
            }

            /* Stop if already an l2meta exists. After yielding, it wouldn't
             * be valid any more, so we'd have to clean up the old L2Metas
             * and deal with requests depending on them before starting to
             * gather new ones. Not worth the trouble. */
            if (bytes == 0 && *m) {
                *cur_bytes = 0;
                return 0;
            }

            if (bytes == 0) {
                /* Wait for the dependency to complete. We need to recheck
                 * the free/allocated clusters when we continue. */
                qemu_co_mutex_unlock(&s->lock);
                qemu_co_queue_wait(&old_alloc->dependent_requests);
                qemu_co_mutex_lock(&s->lock);
                return -EAGAIN;
            }
        }
    }

    /* Make sure that existing clusters and new allocations are only used up to
     * the next dependency if we shortened the request above */
    *cur_bytes = bytes;

    return 0;
}

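/* handle_copied() and handle_alloc() below are the two workers driven by
 * qcow2_alloc_cluster_offset(): the former reuses clusters that are already
 * allocated and carry QCOW_OFLAG_COPIED, the latter allocates new clusters
 * (including COW of existing data) for everything else. */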
/*
 * Checks how many already allocated clusters that don't require a copy on
 * write there are at the given guest_offset (up to *bytes). If
 * *host_offset is not zero, only physically contiguous clusters beginning at
 * this host offset are counted.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no allocated clusters are available at the given offset.
 *          *bytes is normally unchanged. It is set to 0 if the cluster
 *          is allocated and doesn't need COW, but doesn't have the right
 *          physical offset.
 *
 *   1:     if allocated clusters that don't require a COW are available at
 *          the requested offset. *bytes may have decreased and describes
 *          the length of the area that can be written to.
 *
 *  -errno: in error cases
 */
static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t cluster_offset;
    uint64_t *l2_table;
    uint64_t nb_clusters;
    unsigned int keep_clusters;
    int ret;

    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);

    assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
                                == offset_into_cluster(s, *host_offset));

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* Check how many clusters are already allocated and don't need COW */
    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
        && (cluster_offset & QCOW_OFLAG_COPIED))
    {
        /* If a specific host_offset is required, check it */
        bool offset_matches =
            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;

        if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
                                    "%#llx unaligned (guest offset: %#" PRIx64
                                    ")", cluster_offset & L2E_OFFSET_MASK,
                                    guest_offset);
            ret = -EIO;
            goto out;
        }

        if (*host_offset != 0 && !offset_matches) {
            *bytes = 0;
            ret = 0;
            goto out;
        }

        /* We keep all QCOW_OFLAG_COPIED clusters */
        keep_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index],
                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
        assert(keep_clusters <= nb_clusters);

        *bytes = MIN(*bytes,
                 keep_clusters * s->cluster_size
                 - offset_into_cluster(s, guest_offset));

        ret = 1;
    } else {
        ret = 0;
    }

    /* Cleanup */
out:
    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
    if (ret > 0) {
        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }

    return ret;
}

/*
 * Allocates new clusters for the given guest_offset.
 *
 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
 * contain the number of clusters that have been allocated and are contiguous
 * in the image file.
 *
 * If *host_offset is non-zero, it specifies the offset in the image file at
 * which the new clusters must start. *nb_clusters can be 0 on return in this
 * case if the cluster at host_offset is already in use. If *host_offset is
 * zero, the clusters can be allocated anywhere in the image file.
 *
 * *host_offset is updated to contain the offset into the image file at which
 * the first allocated cluster starts.
 *
 * Return 0 on success and -errno in error cases. -EAGAIN means that the
 * function has been waiting for another request and the allocation must be
 * restarted, but the whole request should not be failed.
 */
static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
                                   uint64_t *host_offset, uint64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;

    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                         *host_offset, *nb_clusters);

    /* Allocate new clusters */
    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
    if (*host_offset == 0) {
        int64_t cluster_offset =
            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
        if (cluster_offset < 0) {
            return cluster_offset;
        }
        *host_offset = cluster_offset;
        return 0;
    } else {
        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (ret < 0) {
            return ret;
        }
        *nb_clusters = ret;
        return 0;
    }
}

/*
 * Allocates new clusters for an area that either is yet unallocated or needs a
 * copy on write. If *host_offset is non-zero, clusters are only allocated if
 * the new allocation can match the specified host offset.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no clusters could be allocated. *bytes is set to 0,
 *          *host_offset is left unchanged.
 *
 *   1:     if new clusters were allocated. *bytes may be decreased if the
 *          new allocation doesn't cover all of the requested area.
 *          *host_offset is updated to contain the host offset of the first
 *          newly allocated cluster.
 *
 *  -errno: in error cases
 */
static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t *l2_table;
    uint64_t entry;
    uint64_t nb_clusters;
    int ret;

    uint64_t alloc_cluster_offset;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    entry = be64_to_cpu(l2_table[l2_index]);

    /* For the moment, overwrite compressed clusters one by one */
    if (entry & QCOW_OFLAG_COMPRESSED) {
        nb_clusters = 1;
    } else {
        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
    }

    /* This function is only called when there were no non-COW clusters, so if
     * we can't find any unallocated or COW clusters either, something is
     * wrong with our code. */
    assert(nb_clusters > 0);

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    /* Allocate, if necessary at a given offset in the image file */
    alloc_cluster_offset = start_of_cluster(s, *host_offset);
    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                  &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* Can't extend contiguous allocation */
    if (nb_clusters == 0) {
        *bytes = 0;
        return 0;
    }

    /* !*host_offset would overwrite the image header and is reserved for "no
     * host offset preferred". If 0 was a valid host offset, it'd trigger the
     * following overlap check; do that now to avoid having an invalid value in
     * *host_offset. */
    if (!alloc_cluster_offset) {
        ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
                                            nb_clusters * s->cluster_size);
        assert(ret < 0);
        goto fail;
    }

    /*
     * Save info needed for meta data update.
     *
     * requested_bytes: Number of bytes from the start of the first
     * newly allocated cluster to the end of the (possibly shortened
     * before) write request.
     *
     * avail_bytes: Number of bytes from the start of the first
     * newly allocated to the end of the last newly allocated cluster.
     *
     * nb_bytes: The number of bytes from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
    int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits);
    int nb_bytes = MIN(requested_bytes, avail_bytes);
    QCowL2Meta *old_m = *m;

    *m = g_malloc0(sizeof(**m));

    **m = (QCowL2Meta) {
        .next           = old_m,

        .alloc_offset   = alloc_cluster_offset,
        .offset         = start_of_cluster(s, guest_offset),
        .nb_clusters    = nb_clusters,

        .cow_start = {
            .offset     = 0,
            .nb_bytes   = offset_into_cluster(s, guest_offset),
        },
        .cow_end = {
            .offset     = nb_bytes,
            .nb_bytes   = avail_bytes - nb_bytes,
        },
    };
    qemu_co_queue_init(&(*m)->dependent_requests);
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    return 1;

fail:
    if (*m && (*m)->nb_clusters > 0) {
        QLIST_REMOVE(*m, next_in_flight);
    }
    return ret;
}

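/* Example of the COW regions set up by handle_alloc() above: for a 4 KiB
 * write at guest offset 0x11000 into a single freshly allocated 64 KiB
 * cluster, cow_start covers bytes 0x0..0x1000 of the cluster and cow_end
 * covers bytes 0x2000..0x10000; only 0x1000..0x2000 is filled from the
 * guest request itself. */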
/*
 * alloc_cluster_offset
 *
 * For a given offset on the virtual disk, find the cluster offset in qcow2
 * file. If the offset is not found, allocate a new cluster.
 *
 * If the cluster was already allocated, m->nb_clusters is set to 0 and
 * other fields in m are meaningless.
 *
 * If the cluster is newly allocated, m->nb_clusters is set to the number of
 * contiguous clusters that have been allocated. In this case, the other
 * fields of m are valid and contain information about the first allocated
 * cluster.
 *
 * If the request conflicts with another write request in flight, the coroutine
 * is queued and will be reentered when the dependency has completed.
 *
 * Return 0 on success and -errno in error cases
 */
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                               unsigned int *bytes, uint64_t *host_offset,
                               QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, remaining;
    uint64_t cluster_offset;
    uint64_t cur_bytes;
    int ret;

    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);

again:
    start = offset;
    remaining = *bytes;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
    *m = NULL;

    while (true) {

        if (!*host_offset) {
            *host_offset = start_of_cluster(s, cluster_offset);
        }

        assert(remaining >= cur_bytes);

        start           += cur_bytes;
        remaining       -= cur_bytes;
        cluster_offset  += cur_bytes;

        if (remaining == 0) {
            break;
        }

        cur_bytes = remaining;

        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *      a) Overlap not in the first cluster -> shorten this request and
         *         let the caller handle the rest in its next loop iteration.
         *
         *      b) Real overlaps of two requests. Yield and restart the search
         *         for contiguous clusters (the situation could have changed
         *         while we were sleeping)
         *
         *      c) TODO: Request starts in the same cluster as the in-flight
         *         allocation ends. Shorten the COW of the in-flight allocation,
         *         set cluster_offset to write to the same cluster and set up
         *         the right synchronisation between the in-flight request and
         *         the new one.
         */
        ret = handle_dependencies(bs, start, &cur_bytes, m);
        if (ret == -EAGAIN) {
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
            goto again;
        } else if (ret < 0) {
            return ret;
        } else if (cur_bytes == 0) {
            break;
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
        }

        /*
         * 2. Count contiguous COPIED clusters.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else if (cur_bytes == 0) {
            break;
        }

        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else {
            assert(cur_bytes == 0);
            break;
        }
    }

    *bytes -= remaining;
    assert(*bytes > 0);
    assert(*host_offset != 0);

    return 0;
}

static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}

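/* Worked example of the descriptor decode done below: for coffset = 0x52345
 * with two additional sectors recorded in the descriptor, nb_csectors is 3,
 * sector_offset is 0x145, and csize = 3 * 512 - 0x145 bytes of compressed
 * data are read starting at host sector 0x291. */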
int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data,
                        nb_csectors);
        if (ret < 0) {
            return ret;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -EIO;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}

/*
 * This discards as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of discarded
 * clusters.
 */
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
                             uint64_t nb_clusters, enum qcow2_discard_type type,
                             bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry;

        old_l2_entry = be64_to_cpu(l2_table[l2_index + i]);

        /*
         * If full_discard is false, make sure that a discarded area reads back
         * as zeroes for v3 images (we cannot do it for v2 without actually
         * writing a zero-filled buffer). We can skip the operation if the
         * cluster is already marked as zero, or if it's unallocated and we
         * don't have a backing file.
         *
         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
         *
         * If full_discard is true, the sector should not read back as zeroes,
         * but rather fall through to the backing file.
         */
        switch (qcow2_get_cluster_type(old_l2_entry)) {
        case QCOW2_CLUSTER_UNALLOCATED:
            if (full_discard || !bs->backing) {
                continue;
            }
            break;

        case QCOW2_CLUSTER_ZERO:
            if (!full_discard) {
                continue;
            }
            break;

        case QCOW2_CLUSTER_NORMAL:
        case QCOW2_CLUSTER_COMPRESSED:
            break;

        default:
            abort();
        }

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (!full_discard && s->qcow_version >= 3) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
            l2_table[l2_index + i] = cpu_to_be64(0);
        }

        /* Then decrease the refcount */
        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
    int nb_sectors, enum qcow2_discard_type type, bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset;
    uint64_t nb_clusters;
    int ret;

    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);

    /* Round start up and end down */
    offset = align_offset(offset, s->cluster_size);
    end_offset = start_of_cluster(s, end_offset);

    if (offset > end_offset) {
        return 0;
    }

    nb_clusters = size_to_clusters(s, end_offset - offset);

    s->cache_discards = true;

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
        ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        offset += (ret * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

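/* Example of the rounding done by qcow2_discard_clusters() above: with 64 KiB
 * clusters, a discard covering bytes 0x8000..0x29000 is rounded inwards to
 * 0x10000..0x20000, so exactly one whole cluster is discarded and the partial
 * clusters at both ends are left untouched. */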
/*
 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of zeroed
 * clusters.
 */
static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
                          uint64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_offset;

        old_offset = be64_to_cpu(l2_table[l2_index + i]);

        /* Update L2 entries */
        qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
        if (old_offset & QCOW_OFLAG_COMPRESSED) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
        } else {
            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
        }
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

    return nb_clusters;
}

int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t nb_clusters;
    int ret;

    /* The zero flag is only supported by version 3 and newer */
    if (s->qcow_version < 3) {
        return -ENOTSUP;
    }

    /* Each L2 table is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);

    s->cache_discards = true;

    while (nb_clusters > 0) {
        ret = zero_single_l2(bs, offset, nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        offset += (ret * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * l1_entries and *visited_l1_entries are used to keep track of progress for
 * status_cb(). l1_entries contains the total number of L1 entries and
 * *visited_l1_entries counts all visited L1 entries.
 */
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                                      int l1_size, int64_t *visited_l1_entries,
                                      int64_t l1_entries,
                                      BlockDriverAmendStatusCB *status_cb,
                                      void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_table = NULL;
    int ret;
    int i, j;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
        l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size);
        if (l2_table == NULL) {
            return -ENOMEM;
        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;
        uint64_t l2_refcount;

        if (!l2_offset) {
            /* unallocated */
            (*visited_l1_entries)++;
            if (status_cb) {
                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
            }
            continue;
        }

        if (offset_into_cluster(s, l2_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                    PRIx64 " unaligned (L1 index: %#x)",
                                    l2_offset, i);
            ret = -EIO;
            goto fail;
        }

        if (is_active_l1) {
            /* get active L2 tables from cache */
            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                    (void **)&l2_table);
        } else {
            /* load inactive L2 tables from disk */
            ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
                            (void *)l2_table, s->cluster_sectors);
        }
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            int64_t offset = l2_entry & L2E_OFFSET_MASK;
            int cluster_type = qcow2_get_cluster_type(l2_entry);
            bool preallocated = offset != 0;

            if (cluster_type != QCOW2_CLUSTER_ZERO) {
                continue;
            }

            if (!preallocated) {
                if (!bs->backing) {
                    /* not backed; therefore we can simply deallocate the
                     * cluster */
                    l2_table[j] = 0;
                    l2_dirty = true;
                    continue;
                }

                offset = qcow2_alloc_clusters(bs, s->cluster_size);
                if (offset < 0) {
                    ret = offset;
                    goto fail;
                }

                if (l2_refcount > 1) {
                    /* For shared L2 tables, set the refcount accordingly (it is
                     * already 1 and needs to be l2_refcount) */
                    ret = qcow2_update_cluster_refcount(bs,
                            offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
                            QCOW2_DISCARD_OTHER);
                    if (ret < 0) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_OTHER);
                        goto fail;
                    }
                }
            }

            if (offset_into_cluster(s, offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
                                        "%#" PRIx64 " unaligned (L2 offset: %#"
                                        PRIx64 ", L2 index: %#x)", offset,
                                        l2_offset, j);
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            ret = bdrv_pwrite_zeroes(bs->file->bs, offset, s->cluster_size, 0);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            if (l2_refcount == 1) {
                l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
            } else {
                l2_table[j] = cpu_to_be64(offset);
            }
            l2_dirty = true;
        }

        if (is_active_l1) {
            if (l2_dirty) {
                qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
                qcow2_cache_depends_on_flush(s->l2_table_cache);
            }
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
        } else {
            if (l2_dirty) {
                ret = qcow2_pre_write_overlap_check(bs,
                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
                        s->cluster_size);
                if (ret < 0) {
                    goto fail;
                }

                ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE,
                                 (void *)l2_table, s->cluster_sectors);
                if (ret < 0) {
                    goto fail;
                }
            }
        }

        (*visited_l1_entries)++;
        if (status_cb) {
            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
        }
    }

    ret = 0;

fail:
    if (l2_table) {
        if (!is_active_l1) {
            qemu_vfree(l2_table);
        } else {
            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
        }
    }
    return ret;
}

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
int qcow2_expand_zero_clusters(BlockDriverState *bs,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL;
    int64_t l1_entries = 0, visited_l1_entries = 0;
    int ret;
    int i, j;

    if (status_cb) {
        l1_entries = s->l1_size;
        for (i = 0; i < s->nb_snapshots; i++) {
            l1_entries += s->snapshots[i].l1_size;
        }
    }

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     &visited_l1_entries, l1_entries,
                                     status_cb, cb_opaque);
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us. */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        int l1_sectors = DIV_ROUND_UP(s->snapshots[i].l1_size *
                                      sizeof(uint64_t), BDRV_SECTOR_SIZE);

        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

        ret = bdrv_read(bs->file->bs,
                        s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE,
                        (void *)l1_table, l1_sectors);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         &visited_l1_entries, l1_entries,
                                         status_cb, cb_opaque);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

fail:
    g_free(l1_table);
    return ret;
}