/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <zlib.h>

#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "trace.h"

int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = (new_l1_size * 3 + 1) / 2;
        }
    }

    if (new_l1_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        g_free(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for (i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
                           new_l1_table, new_l1_size2);
    if (ret < 0)
        goto fail;
    for (i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
                           data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }
    g_free(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
    return 0;
fail:
    g_free(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}

/*
 * l2_load
 *
 * Loads a L2 table into memory. If the table is in the cache, the cache
 * is used; otherwise the L2 table is loaded from the image file.
 *
 * On success, returns 0 and stores a pointer to the L2 table in *l2_table;
 * on failure, returns a negative errno value.
 */

static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
                   uint64_t **l2_table)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);

    return ret;
}

/*
 * Writes one sector of the L1 table to the disk (can't update single entries
 * and we really don't want bdrv_pwrite to perform a read-modify-write)
 */
#define L1_ENTRIES_PER_SECTOR (512 / 8)
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t buf[L1_ENTRIES_PER_SECTOR];
    int l1_start_index;
    int i, ret;

    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
    for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
            s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
                           buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L1 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
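 *
 * On success, the new table is returned through *table as a reference held in
 * the L2 table cache; the caller is responsible for releasing it again with
 * qcow2_cache_put().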
 *
 */

static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_table = NULL;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
    if (l2_offset < 0) {
        ret = l2_offset;
        goto fail;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
    if (ret < 0) {
        goto fail;
    }

    l2_table = *table;

    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        uint64_t* old_table;

        /* if there was an old l2 table, read it from the disk */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
        ret = qcow2_cache_get(bs, s->l2_table_cache,
            old_l2_offset & L1E_OFFSET_MASK,
            (void**) &old_table);
        if (ret < 0) {
            goto fail;
        }

        memcpy(l2_table, old_table, s->cluster_size);

        ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
        if (ret < 0) {
            goto fail;
        }
    }

    /* write the l2 table to the file */
    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = qcow2_write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    *table = l2_table;
    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    if (l2_table != NULL) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
    }
    s->l1_table[l1_index] = old_l2_offset;
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                            QCOW2_DISCARD_ALWAYS);
    }
    return ret;
}

/*
 * Checks how many clusters in a given L2 table are contiguous in the image
 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 * to the first cluster, the search is stopped and the cluster is not counted
 * as contiguous.
 * (This allows it, for example, to stop at the first compressed cluster,
 * which may require different handling.)
 */
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t stop_flags)
{
    int i;
    uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
    uint64_t first_entry = be64_to_cpu(l2_table[0]);
    uint64_t offset = first_entry & mask;

    if (!offset)
        return 0;

    assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
        if (offset + (uint64_t) i * cluster_size != l2_entry) {
            break;
        }
    }

    return i;
}

static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
{
    int i;

    for (i = 0; i < nb_clusters; i++) {
        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));

        if (type != QCOW2_CLUSTER_UNALLOCATED) {
            break;
        }
    }

    return i;
}

/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
                           uint8_t *out_buf, const uint8_t *in_buf,
                           int nb_sectors, int enc,
                           const AES_KEY *key)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;

    for (i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
                        ivec.b, enc);
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
}

static int coroutine_fn copy_sectors(BlockDriverState *bs,
                                     uint64_t start_sect,
                                     uint64_t cluster_offset,
                                     int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    QEMUIOVector qiov;
    struct iovec iov;
    int n, ret;

    n = n_end - n_start;
    if (n <= 0) {
        return 0;
    }

    iov.iov_len = n * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);

    qemu_iovec_init_external(&qiov, &iov, 1);

    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);

    if (!bs->drv) {
        ret = -ENOMEDIUM;
        goto out;
    }

    /* Call .bdrv_co_readv() directly instead of using the public block-layer
     * interface. This avoids double I/O throttling and request tracking,
     * which can lead to deadlock when block layer copy-on-read is enabled.
     */
    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
    if (ret < 0) {
        goto out;
    }

    if (s->crypt_method) {
        qcow2_encrypt_sectors(s, start_sect + n_start,
                              iov.iov_base, iov.iov_base, n, 1,
                              &s->aes_encrypt_key);
    }

    ret = qcow2_pre_write_overlap_check(bs, 0,
            cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE);
    if (ret < 0) {
        goto out;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
    if (ret < 0) {
        goto out;
    }

    ret = 0;
out:
    qemu_vfree(iov.iov_base);
    return ret;
}


/*
 * get_cluster_offset
 *
 * For a given offset of the disk image, find the cluster offset in the
 * qcow2 file. The offset is stored in *cluster_offset.
 *
 * on entry, *num is the number of contiguous sectors we'd like to
 * access following offset.
 *
 * on exit, *num is the number of contiguous sectors we can read.
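 *
 * (*num is never increased beyond its value on entry: for example, a request
 * for 128 sectors that begins 10 sectors into a 64-sector cluster and hits a
 * compressed cluster is truncated to the 54 sectors remaining in that
 * cluster.)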
 *
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
 */
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
                             int *num, uint64_t *cluster_offset)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_table;
    int l1_bits, c;
    unsigned int index_in_cluster, nb_clusters;
    uint64_t nb_available, nb_needed;
    int ret;

    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
    nb_needed = *num + index_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the offset and
     * the end of the l1 entry
     */

    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));

    /* compute the number of available sectors */

    nb_available = (nb_available >> 9) + index_in_cluster;

    if (nb_needed > nb_available) {
        nb_needed = nb_available;
    }

    *cluster_offset = 0;

    /* seek the l2 offset in the l1 table */

    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size) {
        ret = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        ret = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    /* load the l2 table in memory */

    ret = l2_load(bs, l2_offset, &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
    nb_clusters = size_to_clusters(s, nb_needed << 9);

    ret = qcow2_get_cluster_type(*cluster_offset);
    switch (ret) {
    case QCOW2_CLUSTER_COMPRESSED:
        /* Compressed clusters can only be processed one by one */
        c = 1;
        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
    case QCOW2_CLUSTER_ZERO:
        if (s->qcow_version < 3) {
            qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
            return -EIO;
        }
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_NORMAL:
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
        break;
    default:
        abort();
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);

    nb_available = (c * s->cluster_sectors);

out:
    if (nb_available > nb_needed)
        nb_available = nb_needed;

    *num = nb_available - index_in_cluster;

    return ret;
}

/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the l2 table.
 *
 * the l2 table offset in the qcow2 file and the cluster index
 * in the l2 table are given to the caller.
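 *
 * The returned *new_l2_table is a reference into the L2 table cache; the
 * caller must release it with qcow2_cache_put() when it is done with it.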
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             int *new_l2_index)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_table = NULL;
    int ret;

    /* seek the l2 offset in the l1 table */

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;

    /* seek the l2 table of the given l2 offset */

    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
        /* load the l2 table in memory */
        ret = l2_load(bs, l2_offset, &l2_table);
        if (ret < 0) {
            return ret;
        }
    } else {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index, &l2_table);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_index = l2_index;

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset of the disk image, return the cluster offset in the
 * qcow2 file.
 *
 * If the offset is not found, allocate a new compressed cluster.
 *
 * Returns the cluster offset if successful; returns 0 otherwise.
 *
 */

uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                               uint64_t offset,
                                               int compressed_size)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_table;
    int64_t cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return 0;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated.
     */
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        return 0;
    }

    return cluster_offset;
}

static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    if (r->nb_sectors == 0) {
        return 0;
    }

    qemu_co_mutex_unlock(&s->lock);
    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
                       r->offset / BDRV_SECTOR_SIZE,
                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
    qemu_co_mutex_lock(&s->lock);

    if (ret < 0) {
        return ret;
    }

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    qcow2_cache_depends_on_flush(s->l2_table_cache);

    return 0;
}

int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_table;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m, &m->cow_start);
    if (ret < 0) {
        goto err;
    }

    ret = perform_cow(bs, m, &m->cow_end);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);

    assert(l2_index + m->nb_clusters <= s->l2_size);
    for (i = 0; i < m->nb_clusters; i++) {
        /* If two concurrent writes happen to the same unallocated cluster,
         * each write allocates a separate cluster and writes data concurrently.
         * The first one to complete updates the L2 table with a pointer to its
         * cluster; the second one has to do RMW (which is done above by
         * copy_sectors()), update the L2 table with its cluster pointer, and
         * free the old cluster.
         * This is what this loop does. */
        if (l2_table[l2_index + i] != 0) {
            old_cluster[j++] = l2_table[l2_index + i];
        }

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
    }


    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        goto err;
    }

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     * Also flush bs->file to get the right order for L2 and refcount update.
     *
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
     * clusters), the next write will reuse them anyway.
     */
    if (j != 0) {
        for (i = 0; i < j; i++) {
            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
                                    QCOW2_DISCARD_NEVER);
        }
    }

    ret = 0;
err:
    g_free(old_cluster);
    return ret;
}

/*
 * Returns the number of contiguous clusters that can be used for an allocating
 * write, but require COW to be performed (this includes yet unallocated space,
 * which must copy from the backing file)
 */
static int count_cow_clusters(BDRVQcowState *s, int nb_clusters,
    uint64_t *l2_table, int l2_index)
{
    int i;

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
        int cluster_type = qcow2_get_cluster_type(l2_entry);

        switch (cluster_type) {
        case QCOW2_CLUSTER_NORMAL:
            if (l2_entry & QCOW_OFLAG_COPIED) {
                goto out;
            }
            break;
        case QCOW2_CLUSTER_UNALLOCATED:
        case QCOW2_CLUSTER_COMPRESSED:
        case QCOW2_CLUSTER_ZERO:
            break;
        default:
            abort();
        }
    }

out:
    assert(i <= nb_clusters);
    return i;
}

/*
 * Check if there already is an AIO write request in flight which allocates
 * the same cluster. In this case we need to wait until the previous
 * request has completed and updated the L2 table accordingly.
 *
 * Returns:
 *   0       if there was no dependency. *cur_bytes indicates the number of
 *           bytes from guest_offset that can be read before the next
 *           dependency must be processed (or the request is complete)
 *
 *   -EAGAIN if we had to wait for another request, previously gathered
 *           information on cluster allocation may be invalid now. The caller
 *           must start over anyway, so consider *cur_bytes undefined.
 */
static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *cur_bytes, QCowL2Meta **m)
{
    BDRVQcowState *s = bs->opaque;
    QCowL2Meta *old_alloc;
    uint64_t bytes = *cur_bytes;

    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t start = guest_offset;
        uint64_t end = start + bytes;
        uint64_t old_start = l2meta_cow_start(old_alloc);
        uint64_t old_end = l2meta_cow_end(old_alloc);

        if (end <= old_start || start >= old_end) {
            /* No intersection */
        } else {
            if (start < old_start) {
                /* Stop at the start of a running allocation */
                bytes = old_start - start;
            } else {
                bytes = 0;
            }

            /* Stop if already an l2meta exists. After yielding, it wouldn't
             * be valid any more, so we'd have to clean up the old L2Metas
             * and deal with requests depending on them before starting to
             * gather new ones. Not worth the trouble. */
            if (bytes == 0 && *m) {
                *cur_bytes = 0;
                return 0;
            }

            if (bytes == 0) {
                /* Wait for the dependency to complete.
                 * We need to recheck the free/allocated clusters when we
                 * continue. */
                qemu_co_mutex_unlock(&s->lock);
                qemu_co_queue_wait(&old_alloc->dependent_requests);
                qemu_co_mutex_lock(&s->lock);
                return -EAGAIN;
            }
        }
    }

    /* Make sure that existing clusters and new allocations are only used up to
     * the next dependency if we shortened the request above */
    *cur_bytes = bytes;

    return 0;
}

/*
 * Checks how many clusters that are already allocated and don't require a
 * copy on write are present at the given guest_offset (up to *bytes). If
 * *host_offset is not zero, only physically contiguous clusters beginning at
 * this host offset are counted.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no allocated clusters are available at the given offset.
 *          *bytes is normally unchanged. It is set to 0 if the cluster
 *          is allocated and doesn't need COW, but doesn't have the right
 *          physical offset.
 *
 *   1:     if allocated clusters that don't require a COW are available at
 *          the requested offset. *bytes may have decreased and describes
 *          the length of the area that can be written to.
 *
 *  -errno: in error cases
 */
static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index;
    uint64_t cluster_offset;
    uint64_t *l2_table;
    unsigned int nb_clusters;
    unsigned int keep_clusters;
    int ret, pret;

    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);

    assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
                                == offset_into_cluster(s, *host_offset));

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
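     * (A request that spans more than one L2 table is completed by the
     * caller, qcow2_alloc_cluster_offset(), over several iterations of its
     * gathering loop.)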
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* Check how many clusters are already allocated and don't need COW */
    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
        && (cluster_offset & QCOW_OFLAG_COPIED))
    {
        /* If a specific host_offset is required, check it */
        bool offset_matches =
            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;

        if (*host_offset != 0 && !offset_matches) {
            *bytes = 0;
            ret = 0;
            goto out;
        }

        /* We keep all QCOW_OFLAG_COPIED clusters */
        keep_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index],
                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
        assert(keep_clusters <= nb_clusters);

        *bytes = MIN(*bytes,
                     keep_clusters * s->cluster_size
                     - offset_into_cluster(s, guest_offset));

        ret = 1;
    } else {
        ret = 0;
    }

    /* Cleanup */
out:
    pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (pret < 0) {
        return pret;
    }

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
    if (ret) {
        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }

    return ret;
}

/*
 * Allocates new clusters for the given guest_offset.
 *
 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
 * contain the number of clusters that have been allocated and are contiguous
 * in the image file.
 *
 * If *host_offset is non-zero, it specifies the offset in the image file at
 * which the new clusters must start. *nb_clusters can be 0 on return in this
 * case if the cluster at host_offset is already in use. If *host_offset is
 * zero, the clusters can be allocated anywhere in the image file.
 *
 * *host_offset is updated to contain the offset into the image file at which
 * the first allocated cluster starts.
 *
 * Return 0 on success and -errno in error cases. -EAGAIN means that the
 * function has been waiting for another request and the allocation must be
 * restarted, but the whole request should not be failed.
 */
static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, unsigned int *nb_clusters)
{
    BDRVQcowState *s = bs->opaque;

    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                         *host_offset, *nb_clusters);

    /* Allocate new clusters */
    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
    if (*host_offset == 0) {
        int64_t cluster_offset =
            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
        if (cluster_offset < 0) {
            return cluster_offset;
        }
        *host_offset = cluster_offset;
        return 0;
    } else {
        int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (ret < 0) {
            return ret;
        }
        *nb_clusters = ret;
        return 0;
    }
}

/*
 * Allocates new clusters for an area that either is yet unallocated or needs a
 * copy on write. If *host_offset is non-zero, clusters are only allocated if
 * the new allocation can match the specified host offset.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no clusters could be allocated. *bytes is set to 0,
 *          *host_offset is left unchanged.
 *
 *   1:     if new clusters were allocated. *bytes may be decreased if the
 *          new allocation doesn't cover all of the requested area.
 *          *host_offset is updated to contain the host offset of the first
 *          newly allocated cluster.
 *
 *  -errno: in error cases
 */
static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index;
    uint64_t *l2_table;
    uint64_t entry;
    unsigned int nb_clusters;
    int ret;

    uint64_t alloc_cluster_offset;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    entry = be64_to_cpu(l2_table[l2_index]);

    /* For the moment, overwrite compressed clusters one by one */
    if (entry & QCOW_OFLAG_COMPRESSED) {
        nb_clusters = 1;
    } else {
        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
    }

    /* This function is only called when there were no non-COW clusters, so if
     * we can't find any unallocated or COW clusters either, something is
     * wrong with our code.
     */
    assert(nb_clusters > 0);

    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* Allocate, if necessary at a given offset in the image file */
    alloc_cluster_offset = start_of_cluster(s, *host_offset);
    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                  &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* Can't extend contiguous allocation */
    if (nb_clusters == 0) {
        *bytes = 0;
        return 0;
    }

    /*
     * Save info needed for metadata update.
     *
     * requested_sectors: Number of sectors from the start of the first
     * newly allocated cluster to the end of the (possibly already shortened)
     * write request.
     *
     * avail_sectors: Number of sectors from the start of the first
     * newly allocated cluster to the end of the last newly allocated cluster.
     *
     * nb_sectors: The number of sectors from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
    int requested_sectors =
        (*bytes + offset_into_cluster(s, guest_offset))
        >> BDRV_SECTOR_BITS;
    int avail_sectors = nb_clusters
                        << (s->cluster_bits - BDRV_SECTOR_BITS);
    int alloc_n_start = offset_into_cluster(s, guest_offset)
                        >> BDRV_SECTOR_BITS;
    int nb_sectors = MIN(requested_sectors, avail_sectors);
    QCowL2Meta *old_m = *m;

    *m = g_malloc0(sizeof(**m));

    **m = (QCowL2Meta) {
        .next           = old_m,

        .alloc_offset   = alloc_cluster_offset,
        .offset         = start_of_cluster(s, guest_offset),
        .nb_clusters    = nb_clusters,
        .nb_available   = nb_sectors,

        .cow_start = {
            .offset     = 0,
            .nb_sectors = alloc_n_start,
        },
        .cow_end = {
            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
            .nb_sectors = avail_sectors - nb_sectors,
        },
    };
    qemu_co_queue_init(&(*m)->dependent_requests);
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
                         - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    return 1;

fail:
    if (*m && (*m)->nb_clusters > 0) {
        QLIST_REMOVE(*m, next_in_flight);
    }
    return ret;
}

/*
 * alloc_cluster_offset
 *
 * For a given offset on the virtual disk, find the cluster offset in the qcow2
 * file. If the offset is not found, allocate a new cluster.
 *
 * If the cluster was already allocated, m->nb_clusters is set to 0 and
 * other fields in m are meaningless.
 *
 * If the cluster is newly allocated, m->nb_clusters is set to the number of
 * contiguous clusters that have been allocated. In this case, the other
 * fields of m are valid and contain information about the first allocated
 * cluster.
 *
 * If the request conflicts with another write request in flight, the coroutine
 * is queued and will be reentered when the dependency has completed.
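 *
 * On success, *host_offset contains the host offset of the first byte to be
 * written and *num is reduced to the number of sectors that could be gathered
 * in this pass; the caller loops until the whole request has been handled.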
 *
 * Return 0 on success and -errno in error cases
 */
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
    int *num, uint64_t *host_offset, QCowL2Meta **m)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t start, remaining;
    uint64_t cluster_offset;
    uint64_t cur_bytes;
    int ret;

    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num);

    assert((offset & ~BDRV_SECTOR_MASK) == 0);

again:
    start = offset;
    remaining = *num << BDRV_SECTOR_BITS;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
    *m = NULL;

    while (true) {

        if (!*host_offset) {
            *host_offset = start_of_cluster(s, cluster_offset);
        }

        assert(remaining >= cur_bytes);

        start           += cur_bytes;
        remaining       -= cur_bytes;
        cluster_offset  += cur_bytes;

        if (remaining == 0) {
            break;
        }

        cur_bytes = remaining;

        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *      a) Overlap not in the first cluster -> shorten this request and
         *         let the caller handle the rest in its next loop iteration.
         *
         *      b) Real overlaps of two requests. Yield and restart the search
         *         for contiguous clusters (the situation could have changed
         *         while we were sleeping)
         *
         *      c) TODO: Request starts in the same cluster as the in-flight
         *         allocation ends. Shorten the COW of the in-flight allocation,
         *         set cluster_offset to write to the same cluster and set up
         *         the right synchronisation between the in-flight request and
         *         the new one.
         */
        ret = handle_dependencies(bs, start, &cur_bytes, m);
        if (ret == -EAGAIN) {
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
            goto again;
        } else if (ret < 0) {
            return ret;
        } else if (cur_bytes == 0) {
            break;
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
        }

        /*
         * 2. Count contiguous COPIED clusters.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else if (cur_bytes == 0) {
            break;
        }

        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else {
            assert(cur_bytes == 0);
            break;
        }
    }

    *num -= remaining >> BDRV_SECTOR_BITS;
    assert(*num > 0);
    assert(*host_offset != 0);

    return 0;
}

static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}

int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcowState *s = bs->opaque;
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors);
        if (ret < 0) {
            return ret;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -EIO;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}

/*
 * This discards as many of the nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of discarded
 * clusters.
 */
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
    unsigned int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_offset;

        old_offset = be64_to_cpu(l2_table[l2_index + i]);

        /*
         * Make sure that a discarded area reads back as zeroes for v3 images
         * (we cannot do it for v2 without actually writing a zero-filled
         * buffer). We can skip the operation if the cluster is already marked
         * as zero, or if it's unallocated and we don't have a backing file.
         *
         * TODO We might want to use bdrv_get_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
         */
        if (old_offset & QCOW_OFLAG_ZERO) {
            continue;
        }

        if ((old_offset & L2E_OFFSET_MASK) == 0 && !bs->backing_hd) {
            continue;
        }

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
        if (s->qcow_version >= 3) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
        } else {
            l2_table[l2_index + i] = cpu_to_be64(0);
        }

        /* Then decrease the refcount */
        qcow2_free_any_clusters(bs, old_offset, 1, type);
    }

    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        return ret;
    }

    return nb_clusters;
}

int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
    int nb_sectors, enum qcow2_discard_type type)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t end_offset;
    unsigned int nb_clusters;
    int ret;

    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);

    /* Round start up and end down */
    offset = align_offset(offset, s->cluster_size);
    end_offset = start_of_cluster(s, end_offset);

    if (offset > end_offset) {
        return 0;
    }

    nb_clusters = size_to_clusters(s, end_offset - offset);

    s->cache_discards = true;

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
        ret = discard_single_l2(bs, offset, nb_clusters, type);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        offset += (ret * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * This zeroes as many of the nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of zeroed
 * clusters.
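 *
 * Compressed clusters are replaced by a plain zero cluster and their refcount
 * is dropped; for all other cluster types only the zero flag is ORed into the
 * existing L2 entry.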
 */
static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
    unsigned int nb_clusters)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_offset;

        old_offset = be64_to_cpu(l2_table[l2_index + i]);

        /* Update L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
        if (old_offset & QCOW_OFLAG_COMPRESSED) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
        } else {
            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
        }
    }

    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        return ret;
    }

    return nb_clusters;
}

int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int nb_clusters;
    int ret;

    /* The zero flag is only supported by version 3 and newer */
    if (s->qcow_version < 3) {
        return -ENOTSUP;
    }

    /* Each L2 table is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);

    s->cache_discards = true;

    while (nb_clusters > 0) {
        ret = zero_single_l2(bs, offset, nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        offset += (ret * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * expanded_clusters is a bitmap where every bit corresponds to one cluster in
 * the image file; a bit gets set if the corresponding cluster has been used for
 * zero expansion (i.e., has been filled with zeroes and is referenced from an
 * L2 table). nb_clusters contains the total cluster count of the image file,
 * i.e., the number of bits in expanded_clusters.
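 *
 * Bit i of the bitmap is stored as (*expanded_clusters)[i / 8] & (1 << (i % 8));
 * both the bitmap and *nb_clusters may be enlarged by this function if the
 * image file grows while clusters are being allocated for zero expansion.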
 */
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
    int l1_size, uint8_t **expanded_clusters,
    uint64_t *nb_clusters)
{
    BDRVQcowState *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_table = NULL;
    int ret;
    int i, j;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
        l2_table = qemu_blockalign(bs, s->cluster_size);
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        bool l2_dirty = false;

        if (!l2_offset) {
            /* unallocated */
            continue;
        }

        if (is_active_l1) {
            /* get active L2 tables from cache */
            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                    (void **)&l2_table);
        } else {
            /* load inactive L2 tables from disk */
            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                    (void *)l2_table, s->cluster_sectors);
        }
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
            int cluster_type = qcow2_get_cluster_type(l2_entry);
            bool preallocated = offset != 0;

            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
                cluster_index = offset >> s->cluster_bits;
                assert((cluster_index >= 0) && (cluster_index < *nb_clusters));
                if ((*expanded_clusters)[cluster_index / 8] &
                    (1 << (cluster_index % 8))) {
                    /* Probably a shared L2 table; this cluster was a zero
                     * cluster which has been expanded, its refcount
                     * therefore most likely requires an update. */
                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
                                                        QCOW2_DISCARD_NEVER);
                    if (ret < 0) {
                        goto fail;
                    }
                    /* Since we just increased the refcount, the COPIED flag may
                     * no longer be set.
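                     * (QCOW_OFLAG_COPIED is only valid for clusters with a
                     * refcount of exactly one, so it has to be cleared here.)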
                     */
                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
                    l2_dirty = true;
                }
                continue;
            } else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
                continue;
            }

            if (!preallocated) {
                if (!bs->backing_hd) {
                    /* not backed; therefore we can simply deallocate the
                     * cluster */
                    l2_table[j] = 0;
                    l2_dirty = true;
                    continue;
                }

                offset = qcow2_alloc_clusters(bs, s->cluster_size);
                if (offset < 0) {
                    ret = offset;
                    goto fail;
                }
            }

            ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
                                    s->cluster_sectors, 0);
            if (ret < 0) {
                if (!preallocated) {
                    qcow2_free_clusters(bs, offset, s->cluster_size,
                                        QCOW2_DISCARD_ALWAYS);
                }
                goto fail;
            }

            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
            l2_dirty = true;

            cluster_index = offset >> s->cluster_bits;

            if (cluster_index >= *nb_clusters) {
                uint64_t old_bitmap_size = (*nb_clusters + 7) / 8;
                uint64_t new_bitmap_size;
                /* The offset may lie beyond the old end of the underlying image
                 * file for growable files only */
                assert(bs->file->growable);
                *nb_clusters = size_to_clusters(s, bs->file->total_sectors *
                                                BDRV_SECTOR_SIZE);
                new_bitmap_size = (*nb_clusters + 7) / 8;
                *expanded_clusters = g_realloc(*expanded_clusters,
                                               new_bitmap_size);
                /* clear the newly allocated space */
                memset(&(*expanded_clusters)[old_bitmap_size], 0,
                       new_bitmap_size - old_bitmap_size);
            }

            assert((cluster_index >= 0) && (cluster_index < *nb_clusters));
            (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8);
        }

        if (is_active_l1) {
            if (l2_dirty) {
                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                qcow2_cache_depends_on_flush(s->l2_table_cache);
            }
            ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
            if (ret < 0) {
                l2_table = NULL;
                goto fail;
            }
        } else {
            if (l2_dirty) {
                ret = qcow2_pre_write_overlap_check(bs,
                        QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset,
                        s->cluster_size);
                if (ret < 0) {
                    goto fail;
                }

                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
                                 (void *)l2_table, s->cluster_sectors);
                if (ret < 0) {
                    goto fail;
                }
            }
        }
    }

    ret = 0;

fail:
    if (l2_table) {
        if (!is_active_l1) {
            qemu_vfree(l2_table);
        } else {
            if (ret < 0) {
                qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
            } else {
                ret = qcow2_cache_put(bs, s->l2_table_cache,
                                      (void **)&l2_table);
            }
        }
    }
    return ret;
}

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
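 *
 * Both the active L1 table and the L1 tables of all internal snapshots are
 * processed.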
 */
int qcow2_expand_zero_clusters(BlockDriverState *bs)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table = NULL;
    uint64_t nb_clusters;
    uint8_t *expanded_clusters;
    int ret;
    int i, j;

    nb_clusters = size_to_clusters(s, bs->file->total_sectors *
                                   BDRV_SECTOR_SIZE);
    expanded_clusters = g_malloc0((nb_clusters + 7) / 8);

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     &expanded_clusters, &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us. */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;

        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);

        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         &expanded_clusters, &nb_clusters);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

fail:
    g_free(expanded_clusters);
    g_free(l1_table);
    return ret;
}