/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include <zlib.h>

#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "trace.h"

int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcowState *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = (new_l1_size * 3 + 1) / 2;
        }
    }

    if (new_l1_size > INT_MAX) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = g_malloc0(align_offset(new_l1_size2, 512));
    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        g_free(new_l1_table);
        return new_l1_table_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    cpu_to_be32w((uint32_t*)data, new_l1_size);
    cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }
    g_free(s->l1_table);
    qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    s->l1_size = new_l1_size;
    return 0;
 fail:
    g_free(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}

/*
 * l2_load
 *
 * Loads an L2 table into memory. If the table is in the cache, the cache
 * is used; otherwise the L2 table is loaded from the image file.
 *
 * Returns 0 on success and stores a pointer to the L2 table in *l2_table;
 * returns a negative errno code if reading the table from the image file
 * failed.
 */

static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
                   uint64_t **l2_table)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table);

    return ret;
}

/*
 * Writes one sector of the L1 table to the disk (can't update single entries
 * and we really don't want bdrv_pwrite to perform a read-modify-write)
 */
#define L1_ENTRIES_PER_SECTOR (512 / 8)
static int write_l1_entry(BlockDriverState *bs, int l1_index)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t buf[L1_ENTRIES_PER_SECTOR];
    int l1_start_index;
    int i, ret;

    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
    for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
    ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
                           buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    return 0;
}

/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L1 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
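 *
 * (Note: an L2 table holds s->l2_size entries of 8 bytes each, which is
 * exactly one cluster, so the allocation below is always one cluster.)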
 *
 */

static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_table;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
    if (l2_offset < 0) {
        return l2_offset;
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table);
    if (ret < 0) {
        return ret;
    }

    l2_table = *table;

    if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
        /* if there was no old l2 table, clear the new table */
        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    } else {
        uint64_t* old_table;

        /* if there was an old l2 table, read it from the disk */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
        ret = qcow2_cache_get(bs, s->l2_table_cache,
                              old_l2_offset & L1E_OFFSET_MASK,
                              (void**) &old_table);
        if (ret < 0) {
            goto fail;
        }

        memcpy(l2_table, old_table, s->cluster_size);

        ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &old_table);
        if (ret < 0) {
            goto fail;
        }
    }

    /* write the l2 table to the file */
    BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

    trace_qcow2_l2_allocate_write_l2(bs, l1_index);
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    *table = l2_table;
    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
    s->l1_table[l1_index] = old_l2_offset;
    return ret;
}

/*
 * Checks how many clusters in a given L2 table are contiguous in the image
 * file. As soon as one of the flags in the bitmask stop_flags changes compared
 * to the first cluster, the search is stopped and the cluster is not counted
 * as contiguous.
 * (This allows it, for example, to stop at the first compressed
 * cluster, which may require different handling.)
 */
static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
        uint64_t *l2_table, uint64_t start, uint64_t stop_flags)
{
    int i;
    uint64_t mask = stop_flags | L2E_OFFSET_MASK;
    uint64_t offset = be64_to_cpu(l2_table[0]) & mask;

    if (!offset)
        return 0;

    for (i = start; i < start + nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
        if (offset + (uint64_t) i * cluster_size != l2_entry) {
            break;
        }
    }

    return (i - start);
}

static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
{
    int i;

    for (i = 0; i < nb_clusters; i++) {
        int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));

        if (type != QCOW2_CLUSTER_UNALLOCATED) {
            break;
        }
    }

    return i;
}

/* The crypt function is compatible with the linux cryptoloop
   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
   supported */
void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
                           uint8_t *out_buf, const uint8_t *in_buf,
                           int nb_sectors, int enc,
                           const AES_KEY *key)
{
    union {
        uint64_t ll[2];
        uint8_t b[16];
    } ivec;
    int i;

    for(i = 0; i < nb_sectors; i++) {
        ivec.ll[0] = cpu_to_le64(sector_num);
        ivec.ll[1] = 0;
        AES_cbc_encrypt(in_buf, out_buf, 512, key,
                        ivec.b, enc);
        sector_num++;
        in_buf += 512;
        out_buf += 512;
    }
}

static int coroutine_fn copy_sectors(BlockDriverState *bs,
                                     uint64_t start_sect,
                                     uint64_t cluster_offset,
                                     int n_start, int n_end)
{
    BDRVQcowState *s = bs->opaque;
    QEMUIOVector qiov;
    struct iovec iov;
    int n, ret;

    /*
     * If this is the last cluster and it is only partially used, we must only
     * copy until the end of the image, or bdrv_check_request will fail for the
     * bdrv_read/write calls below.
     */
    if (start_sect + n_end > bs->total_sectors) {
        n_end = bs->total_sectors - start_sect;
    }

    n = n_end - n_start;
    if (n <= 0) {
        return 0;
    }

    iov.iov_len = n * BDRV_SECTOR_SIZE;
    iov.iov_base = qemu_blockalign(bs, iov.iov_len);

    qemu_iovec_init_external(&qiov, &iov, 1);

    BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);

    /* Call .bdrv_co_readv() directly instead of using the public block-layer
     * interface. This avoids double I/O throttling and request tracking,
     * which can lead to deadlock when block layer copy-on-read is enabled.
     */
    ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov);
    if (ret < 0) {
        goto out;
    }

    if (s->crypt_method) {
        qcow2_encrypt_sectors(s, start_sect + n_start,
                              iov.iov_base, iov.iov_base, n, 1,
                              &s->aes_encrypt_key);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
    ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
    if (ret < 0) {
        goto out;
    }

    ret = 0;
out:
    qemu_vfree(iov.iov_base);
    return ret;
}


/*
 * get_cluster_offset
 *
 * For a given offset of the disk image, find the cluster offset in
 * the qcow2 file. The offset is stored in *cluster_offset.
 *
 * on entry, *num is the number of contiguous sectors we'd like to
 * access following offset.
 *
 * on exit, *num is the number of contiguous sectors we can read.
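 *
 * (With the default 64 KiB clusters, cluster_bits is 16 and l2_bits is 13,
 * so each L2 entry maps one 64 KiB cluster and each L1 entry covers
 * 1 << (13 + 16) = 512 MiB of guest address space.)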
 *
 * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
 * cases.
 */
int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
    int *num, uint64_t *cluster_offset)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset, *l2_table;
    int l1_bits, c;
    unsigned int index_in_cluster, nb_clusters;
    uint64_t nb_available, nb_needed;
    int ret;

    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
    nb_needed = *num + index_in_cluster;

    l1_bits = s->l2_bits + s->cluster_bits;

    /* compute how many bytes there are between the offset and
     * the end of the l1 entry
     */

    nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));

    /* compute the number of available sectors */

    nb_available = (nb_available >> 9) + index_in_cluster;

    if (nb_needed > nb_available) {
        nb_needed = nb_available;
    }

    *cluster_offset = 0;

    /* seek the l2 offset in the l1 table */

    l1_index = offset >> l1_bits;
    if (l1_index >= s->l1_size) {
        ret = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        ret = QCOW2_CLUSTER_UNALLOCATED;
        goto out;
    }

    /* load the l2 table in memory */

    ret = l2_load(bs, l2_offset, &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    *cluster_offset = be64_to_cpu(l2_table[l2_index]);
    nb_clusters = size_to_clusters(s, nb_needed << 9);

    ret = qcow2_get_cluster_type(*cluster_offset);
    switch (ret) {
    case QCOW2_CLUSTER_COMPRESSED:
        /* Compressed clusters can only be processed one by one */
        c = 1;
        *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
    case QCOW2_CLUSTER_ZERO:
        if (s->qcow_version < 3) {
            /* don't leak the cache reference on this error path */
            qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
            return -EIO;
        }
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0,
                QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        /* how many empty clusters ? */
        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
        *cluster_offset = 0;
        break;
    case QCOW2_CLUSTER_NORMAL:
        /* how many allocated clusters ? */
        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
                &l2_table[l2_index], 0,
                QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO);
        *cluster_offset &= L2E_OFFSET_MASK;
        break;
    default:
        abort();
    }

    qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);

    nb_available = (c * s->cluster_sectors);

out:
    if (nb_available > nb_needed)
        nb_available = nb_needed;

    *num = nb_available - index_in_cluster;

    return ret;
}

/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the l2 table.
 *
 * the l2 table offset in the qcow2 file and the cluster index
 * in the l2 table are given to the caller.
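 *
 * (If the L1 entry does not have QCOW_OFLAG_COPIED set, the L2 table cannot
 * be modified in place; a new table is allocated and, if necessary, filled
 * from the old one before the old table's refcount is dropped.)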
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_table,
                             int *new_l2_index)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_table = NULL;
    int ret;

    /* seek the l2 offset in the l1 table */

    l1_index = offset >> (s->l2_bits + s->cluster_bits);
    if (l1_index >= s->l1_size) {
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;

    /* seek the l2 table of the given l2 offset */

    if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) {
        /* load the l2 table in memory */
        ret = l2_load(bs, l2_offset, &l2_table);
        if (ret < 0) {
            return ret;
        }
    } else {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index, &l2_table);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    /* find the cluster offset for the given disk offset */

    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);

    *new_l2_table = l2_table;
    *new_l2_index = l2_index;

    return 0;
}

/*
 * alloc_compressed_cluster_offset
 *
 * For a given offset of the disk image, return cluster offset in
 * qcow2 file.
 *
 * If the offset is not found, allocate a new compressed cluster.
 *
 * Return the cluster offset if successful, or 0 otherwise.
 *
 */

uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                               uint64_t offset,
                                               int compressed_size)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_table;
    int64_t cluster_offset;
    int nb_csectors;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return 0;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated. */
    cluster_offset = be64_to_cpu(l2_table[l2_index]);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
        return 0;
    }

    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
                  (cluster_offset >> 9);

    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
    l2_table[l2_index] = cpu_to_be64(cluster_offset);
    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        return 0;
    }

    return cluster_offset;
}

static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
{
    BDRVQcowState *s = bs->opaque;
    int ret;

    if (r->nb_sectors == 0) {
        return 0;
    }

    qemu_co_mutex_unlock(&s->lock);
    ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
                       r->offset / BDRV_SECTOR_SIZE,
                       r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
    qemu_co_mutex_lock(&s->lock);

    if (ret < 0) {
        return ret;
    }

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    qcow2_cache_depends_on_flush(s->l2_table_cache);

    return 0;
}

int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcowState *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_table;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t));

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m, &m->cow_start);
    if (ret < 0) {
        goto err;
    }

    ret = perform_cow(bs, m, &m->cow_end);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);

    for (i = 0; i < m->nb_clusters; i++) {
        /* If two concurrent writes happen to the same unallocated cluster,
         * each write allocates a separate cluster and writes the data
         * concurrently. The first one to complete updates the L2 table with a
         * pointer to its cluster; the second one has to do RMW (which is done
         * above by copy_sectors()), update the L2 table with its own cluster
         * pointer and free the old cluster.
         * This is what this loop does. */
        if(l2_table[l2_index + i] != 0)
            old_cluster[j++] = l2_table[l2_index + i];

        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
    }


    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        goto err;
    }

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     * Also flush bs->file to get the right order for L2 and refcount update.
     *
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
     * clusters), the next write will reuse them anyway.
     */
    if (j != 0) {
        for (i = 0; i < j; i++) {
            qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
                                    QCOW2_DISCARD_NEVER);
        }
    }

    ret = 0;
err:
    g_free(old_cluster);
    return ret;
}

/*
 * Returns the number of contiguous clusters that can be used for an allocating
 * write, but require COW to be performed (this includes yet unallocated space,
 * which must be copied from the backing file)
 */
static int count_cow_clusters(BDRVQcowState *s, int nb_clusters,
    uint64_t *l2_table, int l2_index)
{
    int i;

    for (i = 0; i < nb_clusters; i++) {
        uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
        int cluster_type = qcow2_get_cluster_type(l2_entry);

        switch(cluster_type) {
        case QCOW2_CLUSTER_NORMAL:
            if (l2_entry & QCOW_OFLAG_COPIED) {
                goto out;
            }
            break;
        case QCOW2_CLUSTER_UNALLOCATED:
        case QCOW2_CLUSTER_COMPRESSED:
        case QCOW2_CLUSTER_ZERO:
            break;
        default:
            abort();
        }
    }

out:
    assert(i <= nb_clusters);
    return i;
}

/*
 * Check if there already is an AIO write request in flight which allocates
 * the same cluster. In this case we need to wait until the previous
 * request has completed and updated the L2 table accordingly.
 *
 * Returns:
 *   0       if there was no dependency. *cur_bytes indicates the number of
 *           bytes from guest_offset that can be read before the next
 *           dependency must be processed (or the request is complete)
 *
 *   -EAGAIN if we had to wait for another request, previously gathered
 *           information on cluster allocation may be invalid now. The caller
 *           must start over anyway, so consider *cur_bytes undefined.
 */
static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *cur_bytes, QCowL2Meta **m)
{
    BDRVQcowState *s = bs->opaque;
    QCowL2Meta *old_alloc;
    uint64_t bytes = *cur_bytes;

    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t start = guest_offset;
        uint64_t end = start + bytes;
        uint64_t old_start = l2meta_cow_start(old_alloc);
        uint64_t old_end = l2meta_cow_end(old_alloc);

        if (end <= old_start || start >= old_end) {
            /* No intersection */
        } else {
            if (start < old_start) {
                /* Stop at the start of a running allocation */
                bytes = old_start - start;
            } else {
                bytes = 0;
            }

            /* Stop if already an l2meta exists. After yielding, it wouldn't
             * be valid any more, so we'd have to clean up the old L2Metas
             * and deal with requests depending on them before starting to
             * gather new ones. Not worth the trouble. */
            if (bytes == 0 && *m) {
                *cur_bytes = 0;
                return 0;
            }

            if (bytes == 0) {
                /* Wait for the dependency to complete. We need to recheck
                 * the free/allocated clusters when we continue. */
                qemu_co_mutex_unlock(&s->lock);
                qemu_co_queue_wait(&old_alloc->dependent_requests);
                qemu_co_mutex_lock(&s->lock);
                return -EAGAIN;
            }
        }
    }

    /* Make sure that existing clusters and new allocations are only used up to
     * the next dependency if we shortened the request above */
    *cur_bytes = bytes;

    return 0;
}

/*
 * Checks how many already allocated clusters that don't require a copy on
 * write there are at the given guest_offset (up to *bytes). If
 * *host_offset is not zero, only physically contiguous clusters beginning at
 * this host offset are counted.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no allocated clusters are available at the given offset.
 *          *bytes is normally unchanged. It is set to 0 if the cluster
 *          is allocated and doesn't need COW, but doesn't have the right
 *          physical offset.
 *
 *   1:     if allocated clusters that don't require a COW are available at
 *          the requested offset. *bytes may have decreased and describes
 *          the length of the area that can be written to.
 *
 *  -errno: in error cases
 */
static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index;
    uint64_t cluster_offset;
    uint64_t *l2_table;
    unsigned int nb_clusters;
    unsigned int keep_clusters;
    int ret, pret;

    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);

    assert(*host_offset == 0 || offset_into_cluster(s, guest_offset)
                                == offset_into_cluster(s, *host_offset));

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
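     *
     * (For example, with 64 KiB clusters an unaligned 1 MiB request spans at
     * most 17 clusters: size_to_clusters(s, 65535 + 1048576) == 17.)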
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    cluster_offset = be64_to_cpu(l2_table[l2_index]);

    /* Check how many clusters are already allocated and don't need COW */
    if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL
        && (cluster_offset & QCOW_OFLAG_COPIED))
    {
        /* If a specific host_offset is required, check it */
        bool offset_matches =
            (cluster_offset & L2E_OFFSET_MASK) == *host_offset;

        if (*host_offset != 0 && !offset_matches) {
            *bytes = 0;
            ret = 0;
            goto out;
        }

        /* We keep all QCOW_OFLAG_COPIED clusters */
        keep_clusters =
            count_contiguous_clusters(nb_clusters, s->cluster_size,
                                      &l2_table[l2_index], 0,
                                      QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
        assert(keep_clusters <= nb_clusters);

        *bytes = MIN(*bytes,
                     keep_clusters * s->cluster_size
                     - offset_into_cluster(s, guest_offset));

        ret = 1;
    } else {
        ret = 0;
    }

    /* Cleanup */
out:
    pret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (pret < 0) {
        return pret;
    }

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
    if (ret) {
        *host_offset = (cluster_offset & L2E_OFFSET_MASK)
                     + offset_into_cluster(s, guest_offset);
    }

    return ret;
}

/*
 * Allocates new clusters for the given guest_offset.
 *
 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
 * contain the number of clusters that have been allocated and are contiguous
 * in the image file.
 *
 * If *host_offset is non-zero, it specifies the offset in the image file at
 * which the new clusters must start. *nb_clusters can be 0 on return in this
 * case if the cluster at host_offset is already in use. If *host_offset is
 * zero, the clusters can be allocated anywhere in the image file.
 *
 * *host_offset is updated to contain the offset into the image file at which
 * the first allocated cluster starts.
 *
 * Return 0 on success and -errno in error cases. -EAGAIN means that the
 * function has been waiting for another request and the allocation must be
 * restarted, but the whole request should not be failed.
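 *
 * (With a non-zero *host_offset, qcow2_alloc_clusters_at() may allocate
 * fewer clusters than requested if the following clusters are already in
 * use; *nb_clusters is updated accordingly and can end up as 0.)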
 */
static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, unsigned int *nb_clusters)
{
    BDRVQcowState *s = bs->opaque;

    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                         *host_offset, *nb_clusters);

    /* Allocate new clusters */
    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
    if (*host_offset == 0) {
        int64_t cluster_offset =
            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
        if (cluster_offset < 0) {
            return cluster_offset;
        }
        *host_offset = cluster_offset;
        return 0;
    } else {
        int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (ret < 0) {
            return ret;
        }
        *nb_clusters = ret;
        return 0;
    }
}

/*
 * Allocates new clusters for an area that either is yet unallocated or needs a
 * copy on write. If *host_offset is non-zero, clusters are only allocated if
 * the new allocation can match the specified host offset.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no clusters could be allocated. *bytes is set to 0,
 *          *host_offset is left unchanged.
 *
 *   1:     if new clusters were allocated. *bytes may be decreased if the
 *          new allocation doesn't cover all of the requested area.
 *          *host_offset is updated to contain the host offset of the first
 *          newly allocated cluster.
 *
 *  -errno: in error cases
 */
static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcowState *s = bs->opaque;
    int l2_index;
    uint64_t *l2_table;
    uint64_t entry;
    unsigned int nb_clusters;
    int ret;

    uint64_t alloc_cluster_offset;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Calculate the number of clusters to look for. We stop at L2 table
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    entry = be64_to_cpu(l2_table[l2_index]);

    /* For the moment, overwrite compressed clusters one by one */
    if (entry & QCOW_OFLAG_COMPRESSED) {
        nb_clusters = 1;
    } else {
        nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index);
    }

    /* This function is only called when there were no non-COW clusters, so if
     * we can't find any unallocated or COW clusters either, something is
     * wrong with our code. */
    assert(nb_clusters > 0);

    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        return ret;
    }

    /* Allocate, if necessary at a given offset in the image file */
    alloc_cluster_offset = start_of_cluster(s, *host_offset);
    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                  &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* Can't extend contiguous allocation */
    if (nb_clusters == 0) {
        *bytes = 0;
        return 0;
    }

    /*
     * Save info needed for metadata update.
     *
     * requested_sectors: Number of sectors from the start of the first
     * newly allocated cluster to the end of the (possibly shortened)
     * write request.
     *
     * avail_sectors: Number of sectors from the start of the first newly
     * allocated cluster to the end of the last newly allocated cluster.
     *
     * nb_sectors: The number of sectors from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
    int requested_sectors =
        (*bytes + offset_into_cluster(s, guest_offset))
        >> BDRV_SECTOR_BITS;
    int avail_sectors = nb_clusters
                        << (s->cluster_bits - BDRV_SECTOR_BITS);
    int alloc_n_start = offset_into_cluster(s, guest_offset)
                        >> BDRV_SECTOR_BITS;
    int nb_sectors = MIN(requested_sectors, avail_sectors);
    QCowL2Meta *old_m = *m;

    *m = g_malloc0(sizeof(**m));

    **m = (QCowL2Meta) {
        .next           = old_m,

        .alloc_offset   = alloc_cluster_offset,
        .offset         = start_of_cluster(s, guest_offset),
        .nb_clusters    = nb_clusters,
        .nb_available   = nb_sectors,

        .cow_start = {
            .offset     = 0,
            .nb_sectors = alloc_n_start,
        },
        .cow_end = {
            .offset     = nb_sectors * BDRV_SECTOR_SIZE,
            .nb_sectors = avail_sectors - nb_sectors,
        },
    };
    qemu_co_queue_init(&(*m)->dependent_requests);
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
    *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
                         - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    return 1;

fail:
    if (*m && (*m)->nb_clusters > 0) {
        QLIST_REMOVE(*m, next_in_flight);
    }
    return ret;
}

/*
 * alloc_cluster_offset
 *
 * For a given offset on the virtual disk, find the cluster offset in qcow2
 * file. If the offset is not found, allocate a new cluster.
 *
 * If the cluster was already allocated, m->nb_clusters is set to 0 and
 * other fields in m are meaningless.
 *
 * If the cluster is newly allocated, m->nb_clusters is set to the number of
 * contiguous clusters that have been allocated. In this case, the other
 * fields of m are valid and contain information about the first allocated
 * cluster.
 *
 * If the request conflicts with another write request in flight, the coroutine
 * is queued and will be reentered when the dependency has completed.
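 *
 * (The typical caller is qcow2_co_writev(): it writes the guest data to the
 * returned host cluster(s) and then calls qcow2_alloc_cluster_link_l2() for
 * every QCowL2Meta in the *m chain so the new mapping becomes visible.)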
 *
 * Return 0 on success and -errno in error cases
 */
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
    int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t start, remaining;
    uint64_t cluster_offset;
    uint64_t cur_bytes;
    int ret;

    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
                                      n_start, n_end);

    assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset));
    offset = start_of_cluster(s, offset);

again:
    start = offset + (n_start << BDRV_SECTOR_BITS);
    remaining = (n_end - n_start) << BDRV_SECTOR_BITS;
    cluster_offset = 0;
    *host_offset = 0;
    cur_bytes = 0;
    *m = NULL;

    while (true) {

        if (!*host_offset) {
            *host_offset = start_of_cluster(s, cluster_offset);
        }

        assert(remaining >= cur_bytes);

        start          += cur_bytes;
        remaining      -= cur_bytes;
        cluster_offset += cur_bytes;

        if (remaining == 0) {
            break;
        }

        cur_bytes = remaining;

        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *      a) Overlap not in the first cluster -> shorten this request and
         *         let the caller handle the rest in its next loop iteration.
         *
         *      b) Real overlaps of two requests. Yield and restart the search
         *         for contiguous clusters (the situation could have changed
         *         while we were sleeping)
         *
         *      c) TODO: Request starts in the same cluster as the in-flight
         *         allocation ends. Shorten the COW of the in-flight
         *         allocation, set cluster_offset to write to the same cluster
         *         and set up the right synchronisation between the in-flight
         *         request and the new one.
         */
        ret = handle_dependencies(bs, start, &cur_bytes, m);
        if (ret == -EAGAIN) {
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
            goto again;
        } else if (ret < 0) {
            return ret;
        } else if (cur_bytes == 0) {
            break;
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
        }

        /*
         * 2. Count contiguous COPIED clusters.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else if (cur_bytes == 0) {
            break;
        }

        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else {
            assert(cur_bytes == 0);
            break;
        }
    }

    *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS);
    assert(*num > 0);
    assert(*host_offset != 0);

    return 0;
}

static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
    z_stream strm1, *strm = &strm1;
    int ret, out_len;

    memset(strm, 0, sizeof(*strm));

    strm->next_in = (uint8_t *)buf;
    strm->avail_in = buf_size;
    strm->next_out = out_buf;
    strm->avail_out = out_buf_size;

    ret = inflateInit2(strm, -12);
    if (ret != Z_OK)
        return -1;
    ret = inflate(strm, Z_FINISH);
    out_len = strm->next_out - out_buf;
    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
        out_len != out_buf_size) {
        inflateEnd(strm);
        return -1;
    }
    inflateEnd(strm);
    return 0;
}

int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
{
    BDRVQcowState *s = bs->opaque;
    int ret, csize, nb_csectors, sector_offset;
    uint64_t coffset;

    coffset = cluster_offset & s->cluster_offset_mask;
    if (s->cluster_cache_offset != coffset) {
        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
        sector_offset = coffset & 511;
        csize = nb_csectors * 512 - sector_offset;
        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
        ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors);
        if (ret < 0) {
            return ret;
        }
        if (decompress_buffer(s->cluster_cache, s->cluster_size,
                              s->cluster_data + sector_offset, csize) < 0) {
            return -EIO;
        }
        s->cluster_cache_offset = coffset;
    }
    return 0;
}

/*
 * This discards as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of discarded
 * clusters.
 */
static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
    unsigned int nb_clusters)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_offset;

        old_offset = be64_to_cpu(l2_table[l2_index + i]);
        if ((old_offset & L2E_OFFSET_MASK) == 0) {
            continue;
        }

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
        l2_table[l2_index + i] = cpu_to_be64(0);

        /* Then decrease the refcount */
        qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
    }

    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        return ret;
    }

    return nb_clusters;
}

int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
    int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t end_offset;
    unsigned int nb_clusters;
    int ret;

    end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);

    /* Round start up and end down */
    offset = align_offset(offset, s->cluster_size);
    end_offset &= ~(s->cluster_size - 1);

    if (offset > end_offset) {
        return 0;
    }

    nb_clusters = size_to_clusters(s, end_offset - offset);

    s->cache_discards = true;

    /* Each L2 table is handled by its own loop iteration */
    while (nb_clusters > 0) {
        ret = discard_single_l2(bs, offset, nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        offset += (ret * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 table) and returns the number of zeroed
 * clusters.
 */
static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
    unsigned int nb_clusters)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l2_table;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 table */
    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_offset;

        old_offset = be64_to_cpu(l2_table[l2_index + i]);

        /* Update L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
        if (old_offset & QCOW_OFLAG_COMPRESSED) {
            l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
            qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
        } else {
            l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO);
        }
    }

    ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    if (ret < 0) {
        return ret;
    }

    return nb_clusters;
}

int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int nb_clusters;
    int ret;

    /* The zero flag is only supported by version 3 and newer */
    if (s->qcow_version < 3) {
        return -ENOTSUP;
    }

    /* Each L2 table is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);

    s->cache_discards = true;

    while (nb_clusters > 0) {
        ret = zero_single_l2(bs, offset, nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        nb_clusters -= ret;
        offset += (ret * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}