1 /* 2 * Block driver for the QCOW version 2 format 3 * 4 * Copyright (c) 2004-2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 
23 */ 24 25 #include "qemu/osdep.h" 26 #include <zlib.h> 27 28 #include "block/block-io.h" 29 #include "qapi/error.h" 30 #include "qcow2.h" 31 #include "qemu/bswap.h" 32 #include "qemu/memalign.h" 33 #include "trace.h" 34 35 int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs, 36 uint64_t exact_size) 37 { 38 BDRVQcow2State *s = bs->opaque; 39 int new_l1_size, i, ret; 40 41 if (exact_size >= s->l1_size) { 42 return 0; 43 } 44 45 new_l1_size = exact_size; 46 47 #ifdef DEBUG_ALLOC2 48 fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size); 49 #endif 50 51 BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE); 52 ret = bdrv_co_pwrite_zeroes(bs->file, 53 s->l1_table_offset + new_l1_size * L1E_SIZE, 54 (s->l1_size - new_l1_size) * L1E_SIZE, 0); 55 if (ret < 0) { 56 goto fail; 57 } 58 59 ret = bdrv_co_flush(bs->file->bs); 60 if (ret < 0) { 61 goto fail; 62 } 63 64 BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS); 65 for (i = s->l1_size - 1; i > new_l1_size - 1; i--) { 66 if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) { 67 continue; 68 } 69 qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK, 70 s->cluster_size, QCOW2_DISCARD_ALWAYS); 71 s->l1_table[i] = 0; 72 } 73 return 0; 74 75 fail: 76 /* 77 * If the write in the l1_table failed the image may contain a partially 78 * overwritten l1_table. In this case it would be better to clear the 79 * l1_table in memory to avoid possible image corruption. 
80 */ 81 memset(s->l1_table + new_l1_size, 0, 82 (s->l1_size - new_l1_size) * L1E_SIZE); 83 return ret; 84 } 85 86 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, 87 bool exact_size) 88 { 89 BDRVQcow2State *s = bs->opaque; 90 int new_l1_size2, ret, i; 91 uint64_t *new_l1_table; 92 int64_t old_l1_table_offset, old_l1_size; 93 int64_t new_l1_table_offset, new_l1_size; 94 uint8_t data[12]; 95 96 if (min_size <= s->l1_size) 97 return 0; 98 99 /* Do a sanity check on min_size before trying to calculate new_l1_size 100 * (this prevents overflows during the while loop for the calculation of 101 * new_l1_size) */ 102 if (min_size > INT_MAX / L1E_SIZE) { 103 return -EFBIG; 104 } 105 106 if (exact_size) { 107 new_l1_size = min_size; 108 } else { 109 /* Bump size up to reduce the number of times we have to grow */ 110 new_l1_size = s->l1_size; 111 if (new_l1_size == 0) { 112 new_l1_size = 1; 113 } 114 while (min_size > new_l1_size) { 115 new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2); 116 } 117 } 118 119 QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX); 120 if (new_l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) { 121 return -EFBIG; 122 } 123 124 #ifdef DEBUG_ALLOC2 125 fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", 126 s->l1_size, new_l1_size); 127 #endif 128 129 new_l1_size2 = L1E_SIZE * new_l1_size; 130 new_l1_table = qemu_try_blockalign(bs->file->bs, new_l1_size2); 131 if (new_l1_table == NULL) { 132 return -ENOMEM; 133 } 134 memset(new_l1_table, 0, new_l1_size2); 135 136 if (s->l1_size) { 137 memcpy(new_l1_table, s->l1_table, s->l1_size * L1E_SIZE); 138 } 139 140 /* write new table (align to cluster) */ 141 BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); 142 new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); 143 if (new_l1_table_offset < 0) { 144 qemu_vfree(new_l1_table); 145 return new_l1_table_offset; 146 } 147 148 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 149 if (ret < 0) { 150 goto fail; 151 } 152 153 /* the L1 position has 
not yet been updated, so these clusters must 154 * indeed be completely free */ 155 ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, 156 new_l1_size2, false); 157 if (ret < 0) { 158 goto fail; 159 } 160 161 BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); 162 for(i = 0; i < s->l1_size; i++) 163 new_l1_table[i] = cpu_to_be64(new_l1_table[i]); 164 ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_size2, 165 new_l1_table, 0); 166 if (ret < 0) 167 goto fail; 168 for(i = 0; i < s->l1_size; i++) 169 new_l1_table[i] = be64_to_cpu(new_l1_table[i]); 170 171 /* set new table */ 172 BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); 173 stl_be_p(data, new_l1_size); 174 stq_be_p(data + 4, new_l1_table_offset); 175 ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), 176 sizeof(data), data, 0); 177 if (ret < 0) { 178 goto fail; 179 } 180 qemu_vfree(s->l1_table); 181 old_l1_table_offset = s->l1_table_offset; 182 s->l1_table_offset = new_l1_table_offset; 183 s->l1_table = new_l1_table; 184 old_l1_size = s->l1_size; 185 s->l1_size = new_l1_size; 186 qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * L1E_SIZE, 187 QCOW2_DISCARD_OTHER); 188 return 0; 189 fail: 190 qemu_vfree(new_l1_table); 191 qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, 192 QCOW2_DISCARD_OTHER); 193 return ret; 194 } 195 196 /* 197 * l2_load 198 * 199 * @bs: The BlockDriverState 200 * @offset: A guest offset, used to calculate what slice of the L2 201 * table to load. 202 * @l2_offset: Offset to the L2 table in the image file. 203 * @l2_slice: Location to store the pointer to the L2 slice. 204 * 205 * Loads a L2 slice into memory (L2 slices are the parts of L2 tables 206 * that are loaded by the qcow2 cache). If the slice is in the cache, 207 * the cache is used; otherwise the L2 slice is loaded from the image 208 * file. 
209 */ 210 static int l2_load(BlockDriverState *bs, uint64_t offset, 211 uint64_t l2_offset, uint64_t **l2_slice) 212 { 213 BDRVQcow2State *s = bs->opaque; 214 int start_of_slice = l2_entry_size(s) * 215 (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset)); 216 217 return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice, 218 (void **)l2_slice); 219 } 220 221 /* 222 * Writes an L1 entry to disk (note that depending on the alignment 223 * requirements this function may write more that just one entry in 224 * order to prevent bdrv_pwrite from performing a read-modify-write) 225 */ 226 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) 227 { 228 BDRVQcow2State *s = bs->opaque; 229 int l1_start_index; 230 int i, ret; 231 int bufsize = MAX(L1E_SIZE, 232 MIN(bs->file->bs->bl.request_alignment, s->cluster_size)); 233 int nentries = bufsize / L1E_SIZE; 234 g_autofree uint64_t *buf = g_try_new0(uint64_t, nentries); 235 236 if (buf == NULL) { 237 return -ENOMEM; 238 } 239 240 l1_start_index = QEMU_ALIGN_DOWN(l1_index, nentries); 241 for (i = 0; i < MIN(nentries, s->l1_size - l1_start_index); i++) { 242 buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); 243 } 244 245 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, 246 s->l1_table_offset + L1E_SIZE * l1_start_index, bufsize, false); 247 if (ret < 0) { 248 return ret; 249 } 250 251 BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); 252 ret = bdrv_pwrite_sync(bs->file, 253 s->l1_table_offset + L1E_SIZE * l1_start_index, 254 bufsize, buf, 0); 255 if (ret < 0) { 256 return ret; 257 } 258 259 return 0; 260 } 261 262 /* 263 * l2_allocate 264 * 265 * Allocate a new l2 entry in the file. If l1_index points to an already 266 * used entry in the L2 table (i.e. we are doing a copy on write for the L2 267 * table) copy the contents of the old L2 table into the newly allocated one. 268 * Otherwise the new table is initialized with zeros. 
269 * 270 */ 271 272 static int l2_allocate(BlockDriverState *bs, int l1_index) 273 { 274 BDRVQcow2State *s = bs->opaque; 275 uint64_t old_l2_offset; 276 uint64_t *l2_slice = NULL; 277 unsigned slice, slice_size2, n_slices; 278 int64_t l2_offset; 279 int ret; 280 281 old_l2_offset = s->l1_table[l1_index]; 282 283 trace_qcow2_l2_allocate(bs, l1_index); 284 285 /* allocate a new l2 entry */ 286 287 l2_offset = qcow2_alloc_clusters(bs, s->l2_size * l2_entry_size(s)); 288 if (l2_offset < 0) { 289 ret = l2_offset; 290 goto fail; 291 } 292 293 /* The offset must fit in the offset field of the L1 table entry */ 294 assert((l2_offset & L1E_OFFSET_MASK) == l2_offset); 295 296 /* If we're allocating the table at offset 0 then something is wrong */ 297 if (l2_offset == 0) { 298 qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid " 299 "allocation of L2 table at offset 0"); 300 ret = -EIO; 301 goto fail; 302 } 303 304 ret = qcow2_cache_flush(bs, s->refcount_block_cache); 305 if (ret < 0) { 306 goto fail; 307 } 308 309 /* allocate a new entry in the l2 cache */ 310 311 slice_size2 = s->l2_slice_size * l2_entry_size(s); 312 n_slices = s->cluster_size / slice_size2; 313 314 trace_qcow2_l2_allocate_get_empty(bs, l1_index); 315 for (slice = 0; slice < n_slices; slice++) { 316 ret = qcow2_cache_get_empty(bs, s->l2_table_cache, 317 l2_offset + slice * slice_size2, 318 (void **) &l2_slice); 319 if (ret < 0) { 320 goto fail; 321 } 322 323 if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { 324 /* if there was no old l2 table, clear the new slice */ 325 memset(l2_slice, 0, slice_size2); 326 } else { 327 uint64_t *old_slice; 328 uint64_t old_l2_slice_offset = 329 (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2; 330 331 /* if there was an old l2 table, read a slice from the disk */ 332 BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); 333 ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset, 334 (void **) &old_slice); 335 if (ret < 0) { 336 goto fail; 337 } 338 339 
memcpy(l2_slice, old_slice, slice_size2); 340 341 qcow2_cache_put(s->l2_table_cache, (void **) &old_slice); 342 } 343 344 /* write the l2 slice to the file */ 345 BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); 346 347 trace_qcow2_l2_allocate_write_l2(bs, l1_index); 348 qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 349 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 350 } 351 352 ret = qcow2_cache_flush(bs, s->l2_table_cache); 353 if (ret < 0) { 354 goto fail; 355 } 356 357 /* update the L1 entry */ 358 trace_qcow2_l2_allocate_write_l1(bs, l1_index); 359 s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; 360 ret = qcow2_write_l1_entry(bs, l1_index); 361 if (ret < 0) { 362 goto fail; 363 } 364 365 trace_qcow2_l2_allocate_done(bs, l1_index, 0); 366 return 0; 367 368 fail: 369 trace_qcow2_l2_allocate_done(bs, l1_index, ret); 370 if (l2_slice != NULL) { 371 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 372 } 373 s->l1_table[l1_index] = old_l2_offset; 374 if (l2_offset > 0) { 375 qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s), 376 QCOW2_DISCARD_ALWAYS); 377 } 378 return ret; 379 } 380 381 /* 382 * For a given L2 entry, count the number of contiguous subclusters of 383 * the same type starting from @sc_from. Compressed clusters are 384 * treated as if they were divided into subclusters of size 385 * s->subcluster_size. 386 * 387 * Return the number of contiguous subclusters and set @type to the 388 * subcluster type. 389 * 390 * If the L2 entry is invalid return -errno and set @type to 391 * QCOW2_SUBCLUSTER_INVALID. 
392 */ 393 static int qcow2_get_subcluster_range_type(BlockDriverState *bs, 394 uint64_t l2_entry, 395 uint64_t l2_bitmap, 396 unsigned sc_from, 397 QCow2SubclusterType *type) 398 { 399 BDRVQcow2State *s = bs->opaque; 400 uint32_t val; 401 402 *type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_from); 403 404 if (*type == QCOW2_SUBCLUSTER_INVALID) { 405 return -EINVAL; 406 } else if (!has_subclusters(s) || *type == QCOW2_SUBCLUSTER_COMPRESSED) { 407 return s->subclusters_per_cluster - sc_from; 408 } 409 410 switch (*type) { 411 case QCOW2_SUBCLUSTER_NORMAL: 412 val = l2_bitmap | QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from); 413 return cto32(val) - sc_from; 414 415 case QCOW2_SUBCLUSTER_ZERO_PLAIN: 416 case QCOW2_SUBCLUSTER_ZERO_ALLOC: 417 val = (l2_bitmap | QCOW_OFLAG_SUB_ZERO_RANGE(0, sc_from)) >> 32; 418 return cto32(val) - sc_from; 419 420 case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN: 421 case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: 422 val = ((l2_bitmap >> 32) | l2_bitmap) 423 & ~QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from); 424 return ctz32(val) - sc_from; 425 426 default: 427 g_assert_not_reached(); 428 } 429 } 430 431 /* 432 * Return the number of contiguous subclusters of the exact same type 433 * in a given L2 slice, starting from cluster @l2_index, subcluster 434 * @sc_index. Allocated subclusters are required to be contiguous in 435 * the image file. 436 * At most @nb_clusters are checked (note that this means clusters, 437 * not subclusters). 438 * Compressed clusters are always processed one by one but for the 439 * purpose of this count they are treated as if they were divided into 440 * subclusters of size s->subcluster_size. 441 * On failure return -errno and update @l2_index to point to the 442 * invalid entry. 
443 */ 444 static int count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters, 445 unsigned sc_index, uint64_t *l2_slice, 446 unsigned *l2_index) 447 { 448 BDRVQcow2State *s = bs->opaque; 449 int i, count = 0; 450 bool check_offset = false; 451 uint64_t expected_offset = 0; 452 QCow2SubclusterType expected_type = QCOW2_SUBCLUSTER_NORMAL, type; 453 454 assert(*l2_index + nb_clusters <= s->l2_slice_size); 455 456 for (i = 0; i < nb_clusters; i++) { 457 unsigned first_sc = (i == 0) ? sc_index : 0; 458 uint64_t l2_entry = get_l2_entry(s, l2_slice, *l2_index + i); 459 uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, *l2_index + i); 460 int ret = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap, 461 first_sc, &type); 462 if (ret < 0) { 463 *l2_index += i; /* Point to the invalid entry */ 464 return -EIO; 465 } 466 if (i == 0) { 467 if (type == QCOW2_SUBCLUSTER_COMPRESSED) { 468 /* Compressed clusters are always processed one by one */ 469 return ret; 470 } 471 expected_type = type; 472 expected_offset = l2_entry & L2E_OFFSET_MASK; 473 check_offset = (type == QCOW2_SUBCLUSTER_NORMAL || 474 type == QCOW2_SUBCLUSTER_ZERO_ALLOC || 475 type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC); 476 } else if (type != expected_type) { 477 break; 478 } else if (check_offset) { 479 expected_offset += s->cluster_size; 480 if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) { 481 break; 482 } 483 } 484 count += ret; 485 /* Stop if there are type changes before the end of the cluster */ 486 if (first_sc + ret < s->subclusters_per_cluster) { 487 break; 488 } 489 } 490 491 return count; 492 } 493 494 static int coroutine_fn GRAPH_RDLOCK 495 do_perform_cow_read(BlockDriverState *bs, uint64_t src_cluster_offset, 496 unsigned offset_in_cluster, QEMUIOVector *qiov) 497 { 498 int ret; 499 500 if (qiov->size == 0) { 501 return 0; 502 } 503 504 BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); 505 506 if (!bs->drv) { 507 return -ENOMEDIUM; 508 } 509 510 /* 511 * We never deal with requests that 
don't satisfy 512 * bdrv_check_qiov_request(), and aligning requests to clusters never 513 * breaks this condition. So, do some assertions before calling 514 * bs->drv->bdrv_co_preadv_part() which has int64_t arguments. 515 */ 516 assert(src_cluster_offset <= INT64_MAX); 517 assert(src_cluster_offset + offset_in_cluster <= INT64_MAX); 518 /* Cast qiov->size to uint64_t to silence a compiler warning on -m32 */ 519 assert((uint64_t)qiov->size <= INT64_MAX); 520 bdrv_check_qiov_request(src_cluster_offset + offset_in_cluster, qiov->size, 521 qiov, 0, &error_abort); 522 /* 523 * Call .bdrv_co_readv() directly instead of using the public block-layer 524 * interface. This avoids double I/O throttling and request tracking, 525 * which can lead to deadlock when block layer copy-on-read is enabled. 526 */ 527 ret = bs->drv->bdrv_co_preadv_part(bs, 528 src_cluster_offset + offset_in_cluster, 529 qiov->size, qiov, 0, 0); 530 if (ret < 0) { 531 return ret; 532 } 533 534 return 0; 535 } 536 537 static int coroutine_fn GRAPH_RDLOCK 538 do_perform_cow_write(BlockDriverState *bs, uint64_t cluster_offset, 539 unsigned offset_in_cluster, QEMUIOVector *qiov) 540 { 541 BDRVQcow2State *s = bs->opaque; 542 int ret; 543 544 if (qiov->size == 0) { 545 return 0; 546 } 547 548 ret = qcow2_pre_write_overlap_check(bs, 0, 549 cluster_offset + offset_in_cluster, qiov->size, true); 550 if (ret < 0) { 551 return ret; 552 } 553 554 BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); 555 ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster, 556 qiov->size, qiov, 0); 557 if (ret < 0) { 558 return ret; 559 } 560 561 return 0; 562 } 563 564 565 /* 566 * get_host_offset 567 * 568 * For a given offset of the virtual disk find the equivalent host 569 * offset in the qcow2 file and store it in *host_offset. Neither 570 * offset needs to be aligned to a cluster boundary. 571 * 572 * If the cluster is unallocated then *host_offset will be 0. 
 * If the cluster is compressed then *host_offset will contain the l2 entry.
 *
 * On entry, *bytes is the maximum number of contiguous bytes starting at
 * offset that we are interested in.
 *
 * On exit, *bytes is the number of bytes starting at offset that have the same
 * subcluster type and (if applicable) are stored contiguously in the image
 * file. The subcluster type is stored in *subcluster_type.
 * Compressed clusters are always processed one by one.
 *
 * Returns 0 on success, -errno in error cases. On error *bytes and
 * *subcluster_type are not written.
 */
int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
                          unsigned int *bytes, uint64_t *host_offset,
                          QCow2SubclusterType *subcluster_type)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index, sc_index;
    uint64_t l1_index, l2_offset, *l2_slice, l2_entry, l2_bitmap;
    int sc;
    unsigned int offset_in_cluster;
    uint64_t bytes_available, bytes_needed, nb_clusters;
    QCow2SubclusterType type;
    int ret;

    /* From here on, byte counts are measured from the start of the cluster
     * that contains offset, hence the offset_in_cluster added here (and
     * subtracted again before returning) */
    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    /* compute how many bytes there are between the start of the cluster
     * containing offset and the end of the l2 slice that contains
     * the entry pointing to it */
    bytes_available =
        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
        << s->cluster_bits;

    /* A single call never looks beyond the end of the current L2 slice */
    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
    }

    *host_offset = 0;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        /* Beyond the end of the L1 table: reported as unallocated */
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        /* No L2 table for this range: reported as unallocated */
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
        goto out;
    }

    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* load the l2 slice in memory */

    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        return ret;
    }

    /* From this point the slice is held; every exit path below has to
     * release it with qcow2_cache_put() */

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_slice_index(s, offset);
    sc_index = offset_to_sc_index(s, offset);
    l2_entry = get_l2_entry(s, l2_slice, l2_index);
    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);

    nb_clusters = size_to_clusters(s, bytes_needed);
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
     * integers; the minimum cluster size is 512, so this assertion is always
     * true */
    assert(nb_clusters <= INT_MAX);

    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
    if (s->qcow_version < 3 && (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
                                type == QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                " in pre-v3 image (L2 offset: %#" PRIx64
                                ", L2 index: %#x)", l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    switch (type) {
    case QCOW2_SUBCLUSTER_INVALID:
        break; /* This is handled by count_contiguous_subclusters() below */
    case QCOW2_SUBCLUSTER_COMPRESSED:
        if (has_data_file(bs)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Compressed cluster "
                                    "entry found in image with external data "
                                    "file (L2 offset: %#" PRIx64 ", L2 index: "
                                    "%#x)", l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        /* For compressed clusters the caller gets the raw L2 entry */
        *host_offset = l2_entry;
        break;
    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
        /* No host cluster: *host_offset stays 0 */
        break;
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
    case QCOW2_SUBCLUSTER_NORMAL:
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: {
        uint64_t host_cluster_offset = l2_entry & L2E_OFFSET_MASK;
        *host_offset = host_cluster_offset + offset_in_cluster;
        if (offset_into_cluster(s, host_cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "Cluster allocation offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", host_cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        /* With an external data file the host offset has to be equal to
         * the guest offset */
        if (has_data_file(bs) && *host_offset != offset) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "External data file host cluster offset %#"
                                    PRIx64 " does not match guest cluster "
                                    "offset: %#" PRIx64
                                    ", L2 index: %#x)", host_cluster_offset,
                                    offset - offset_in_cluster, l2_index);
            ret = -EIO;
            goto fail;
        }
        break;
    }
    default:
        abort();
    }

    /* On failure l2_index is updated to point to the invalid entry, which
     * is the index reported in the corruption message */
    sc = count_contiguous_subclusters(bs, nb_clusters, sc_index,
                                      l2_slice, &l2_index);
    if (sc < 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster entry found "
                                " (L2 offset: %#" PRIx64 ", L2 index: %#x)",
                                l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    /* Convert the subcluster count back into bytes, again measured from
     * the start of the cluster containing offset */
    bytes_available = ((int64_t)sc + sc_index) << s->subcluster_bits;

out:
    if (bytes_available > bytes_needed) {
        bytes_available = bytes_needed;
    }

    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
     * subtracting offset_in_cluster will therefore definitely yield something
     * not exceeding UINT_MAX */
    assert(bytes_available - offset_in_cluster <= UINT_MAX);
    *bytes = bytes_available - offset_in_cluster;

    *subcluster_type = type;

    return 0;

fail:
    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
    return ret;
}

/*
 * get_cluster_table
 *
 * for a given disk offset, load (and allocate if needed)
 * the appropriate slice of its l2 table.
 *
 * the cluster index in the l2 slice is given to the caller.
751 * 752 * Returns 0 on success, -errno in failure case 753 */ 754 static int get_cluster_table(BlockDriverState *bs, uint64_t offset, 755 uint64_t **new_l2_slice, 756 int *new_l2_index) 757 { 758 BDRVQcow2State *s = bs->opaque; 759 unsigned int l2_index; 760 uint64_t l1_index, l2_offset; 761 uint64_t *l2_slice = NULL; 762 int ret; 763 764 /* seek to the l2 offset in the l1 table */ 765 766 l1_index = offset_to_l1_index(s, offset); 767 if (l1_index >= s->l1_size) { 768 ret = qcow2_grow_l1_table(bs, l1_index + 1, false); 769 if (ret < 0) { 770 return ret; 771 } 772 } 773 774 assert(l1_index < s->l1_size); 775 l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; 776 if (offset_into_cluster(s, l2_offset)) { 777 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 778 " unaligned (L1 index: %#" PRIx64 ")", 779 l2_offset, l1_index); 780 return -EIO; 781 } 782 783 if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) { 784 /* First allocate a new L2 table (and do COW if needed) */ 785 ret = l2_allocate(bs, l1_index); 786 if (ret < 0) { 787 return ret; 788 } 789 790 /* Then decrease the refcount of the old table */ 791 if (l2_offset) { 792 qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s), 793 QCOW2_DISCARD_OTHER); 794 } 795 796 /* Get the offset of the newly-allocated l2 table */ 797 l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; 798 assert(offset_into_cluster(s, l2_offset) == 0); 799 } 800 801 /* load the l2 slice in memory */ 802 ret = l2_load(bs, offset, l2_offset, &l2_slice); 803 if (ret < 0) { 804 return ret; 805 } 806 807 /* find the cluster offset for the given disk offset */ 808 809 l2_index = offset_to_l2_slice_index(s, offset); 810 811 *new_l2_slice = l2_slice; 812 *new_l2_index = l2_index; 813 814 return 0; 815 } 816 817 /* 818 * alloc_compressed_cluster_offset 819 * 820 * For a given offset on the virtual disk, allocate a new compressed cluster 821 * and put the host offset of the cluster into *host_offset. 
 * If a cluster is already allocated at the offset, return an error.
 *
 * Return 0 on success and -errno in error cases
 */
int coroutine_fn qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
                                                       uint64_t offset,
                                                       int compressed_size,
                                                       uint64_t *host_offset)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index, ret;
    uint64_t *l2_slice;
    int64_t cluster_offset;
    int nb_csectors;

    /* NOTE(review): with an external data file this returns success
     * without allocating anything and without writing *host_offset --
     * confirm that callers never reach this path */
    if (has_data_file(bs)) {
        return 0;
    }

    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Compression can't overwrite anything. Fail if the cluster was already
     * allocated. */
    cluster_offset = get_l2_entry(s, l2_slice, l2_index);
    if (cluster_offset & L2E_OFFSET_MASK) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        return -EIO;
    }

    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
    if (cluster_offset < 0) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        return cluster_offset;
    }

    /* Index of the sector holding the last compressed byte minus the index
     * of the sector holding the first one */
    nb_csectors =
        (cluster_offset + compressed_size - 1) / QCOW2_COMPRESSED_SECTOR_SIZE -
        (cluster_offset / QCOW2_COMPRESSED_SECTOR_SIZE);

    /* The offset and size must fit in their fields of the L2 table entry */
    assert((cluster_offset & s->cluster_offset_mask) == cluster_offset);
    assert((nb_csectors & s->csize_mask) == nb_csectors);

    /* Build the L2 entry: compressed flag, sector count and host offset */
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
                      ((uint64_t)nb_csectors << s->csize_shift);

    /* update L2 table */

    /* compressed clusters never have the copied flag */

    BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
    set_l2_entry(s, l2_slice, l2_index, cluster_offset);
    if (has_subclusters(s)) {
        set_l2_bitmap(s, l2_slice, l2_index, 0);
    }
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    /* Strip the flag and size fields again: only the offset is returned */
    *host_offset = cluster_offset & s->cluster_offset_mask;
    return 0;
}

/*
 * perform_cow
 *
 * Copy the two COW regions described by @m (m->cow_start and m->cow_end):
 * they are read from @bs relative to m->offset and written relative to
 * m->alloc_offset. If m->data_qiov is set, its data is written between
 * the two regions in the same request.
 *
 * s->lock is released while the I/O is in progress and taken again
 * before returning (presumably the caller holds it on entry -- confirm
 * against callers).
 *
 * Returns 0 on success, also when there is nothing to copy or
 * m->skip_cow is set, -ENOMEM if the bounce buffer cannot be allocated,
 * or the negative value of a failed read, encryption or write.
 */
static int coroutine_fn GRAPH_RDLOCK
perform_cow(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2COWRegion *start = &m->cow_start;
    Qcow2COWRegion *end = &m->cow_end;
    unsigned buffer_size;
    /* Size of the gap between the end of the start region and the
     * beginning of the end region */
    unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
    bool merge_reads;
    uint8_t *start_buffer, *end_buffer;
    QEMUIOVector qiov;
    int ret;

    assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
    assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
    assert(start->offset + start->nb_bytes <= end->offset);

    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
        return 0;
    }

    /* If we have to read both the start and end COW regions and the
     * middle region is not too large then perform just one read
     * operation */
    merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
    if (merge_reads) {
        buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
    } else {
        /* If we have to do two reads, add some padding in the middle
         * if necessary to make sure that the end region is optimally
         * aligned. */
        size_t align = bdrv_opt_mem_align(bs);
        assert(align > 0 && align <= UINT_MAX);
        assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
               UINT_MAX - end->nb_bytes);
        buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
    }

    /* Reserve a buffer large enough to store all the data that we're
     * going to read */
    start_buffer = qemu_try_blockalign(bs, buffer_size);
    if (start_buffer == NULL) {
        return -ENOMEM;
    }
    /* The part of the buffer where the end region is located */
    end_buffer = start_buffer + buffer_size - end->nb_bytes;

    /* Two entries for the COW buffers plus however many the guest data
     * needs */
    qemu_iovec_init(&qiov, 2 + (m->data_qiov ?
                                qemu_iovec_subvec_niov(m->data_qiov,
                                                       m->data_qiov_offset,
                                                       data_bytes)
                                : 0));

    qemu_co_mutex_unlock(&s->lock);
    /* First we read the existing data from both COW regions. We
     * either read the whole region in one go, or the start and end
     * regions separately. */
    if (merge_reads) {
        qemu_iovec_add(&qiov, start_buffer, buffer_size);
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
    } else {
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
    }
    if (ret < 0) {
        goto fail;
    }

    /* Encrypt the data if necessary before writing it */
    if (bs->encrypted) {
        ret = qcow2_co_encrypt(bs,
                               m->alloc_offset + start->offset,
                               m->offset + start->offset,
                               start_buffer, start->nb_bytes);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_co_encrypt(bs,
                               m->alloc_offset + end->offset,
                               m->offset + end->offset,
                               end_buffer, end->nb_bytes);
        if (ret < 0) {
            goto fail;
        }
    }

    /* And now we can write everything. If we have the guest data we
     * can write everything in one single operation */
    if (m->data_qiov) {
        qemu_iovec_reset(&qiov);
        if (start->nb_bytes) {
            qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        }
        qemu_iovec_concat(&qiov, m->data_qiov, m->data_qiov_offset, data_bytes);
        if (end->nb_bytes) {
            qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        }
        /* NOTE: we have a write_aio blkdebug event here followed by
         * a cow_write one in do_perform_cow_write(), but there's only
         * one single I/O operation */
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
    } else {
        /* If there's no guest data then write both COW regions separately */
        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
    }

    /* The success path falls through to this label as well: the lock is
     * re-taken and the buffers are released in both cases */
fail:
    qemu_co_mutex_lock(&s->lock);

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    if (ret == 0) {
        qcow2_cache_depends_on_flush(s->l2_table_cache);
    }

    qemu_vfree(start_buffer);
    qemu_iovec_destroy(&qiov);
    return ret;
}

/*
 * qcow2_alloc_cluster_link_l2
 *
 * Finish the allocation described by @m: perform the copy-on-write, then
 * make the m->nb_clusters L2 entries starting at guest offset m->offset
 * point to the clusters starting at m->alloc_offset (with
 * QCOW_OFLAG_COPIED set). Unless m->keep_old_clusters is set, the
 * clusters that the replaced non-zero L2 entries referred to are freed
 * afterwards.
 *
 * Returns 0 on success, -ENOMEM if the list of old entries cannot be
 * allocated, or the negative value returned by perform_cow() or
 * get_cluster_table().
 */
int coroutine_fn qcow2_alloc_cluster_link_l2(BlockDriverState *bs,
                                             QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    /* j counts the old L2 entries collected in old_cluster[] */
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_slice;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    old_cluster = g_try_new(uint64_t, m->nb_clusters);
    if (old_cluster == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);

    assert(l2_index + m->nb_clusters <= s->l2_slice_size);
    assert(m->cow_end.offset + m->cow_end.nb_bytes <=
           m->nb_clusters << s->cluster_bits);
    for (i = 0; i < m->nb_clusters; i++) {
        uint64_t offset = cluster_offset + ((uint64_t)i << s->cluster_bits);
        /* if two concurrent writes happen to the same unallocated cluster
         * each write allocates separate cluster and writes data concurrently.
         * The first one to complete updates l2 table with pointer to its
         * cluster the second one has to do RMW (which is done above by
         * perform_cow()), update l2 table with its cluster pointer and free
         * old cluster. This is what this loop does */
        if (get_l2_entry(s, l2_slice, l2_index + i) != 0) {
            old_cluster[j++] = get_l2_entry(s, l2_slice, l2_index + i);
        }

        /* The offset must fit in the offset field of the L2 table entry */
        assert((offset & L2E_OFFSET_MASK) == offset);

        set_l2_entry(s, l2_slice, l2_index + i, offset | QCOW_OFLAG_COPIED);

        /* Update bitmap with the subclusters that were just written */
        if (has_subclusters(s) && !m->prealloc) {
            uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
            /* Range covered by the request including both COW regions */
            unsigned written_from = m->cow_start.offset;
            unsigned written_to = m->cow_end.offset + m->cow_end.nb_bytes;
            int first_sc, last_sc;
            /* Narrow written_from and written_to down to the current cluster */
            written_from = MAX(written_from, i << s->cluster_bits);
            written_to = MIN(written_to, (i + 1) << s->cluster_bits);
            assert(written_from < written_to);
            first_sc = offset_to_sc_index(s, written_from);
            last_sc = offset_to_sc_index(s, written_to - 1);
            /* Set the allocation bits and clear the zero bits of the
             * subclusters first_sc through last_sc */
            l2_bitmap |= QCOW_OFLAG_SUB_ALLOC_RANGE(first_sc, last_sc + 1);
            l2_bitmap &= ~QCOW_OFLAG_SUB_ZERO_RANGE(first_sc, last_sc + 1);
            set_l2_bitmap(s, l2_slice, l2_index + i, l2_bitmap);
        }
    }


    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     *
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
     * clusters), the next write will reuse them anyway.
     */
    if (!m->keep_old_clusters && j != 0) {
        for (i = 0; i < j; i++) {
            qcow2_free_any_cluster(bs, old_cluster[i], QCOW2_DISCARD_NEVER);
        }
    }

    ret = 0;
err:
    g_free(old_cluster);
    return ret;
}

/**
 * Frees the allocated clusters because the request failed and they won't
 * actually be linked.
1128 */ 1129 void coroutine_fn qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) 1130 { 1131 BDRVQcow2State *s = bs->opaque; 1132 if (!has_data_file(bs) && !m->keep_old_clusters) { 1133 qcow2_free_clusters(bs, m->alloc_offset, 1134 m->nb_clusters << s->cluster_bits, 1135 QCOW2_DISCARD_NEVER); 1136 } 1137 } 1138 1139 /* 1140 * For a given write request, create a new QCowL2Meta structure, add 1141 * it to @m and the BDRVQcow2State.cluster_allocs list. If the write 1142 * request does not need copy-on-write or changes to the L2 metadata 1143 * then this function does nothing. 1144 * 1145 * @host_cluster_offset points to the beginning of the first cluster. 1146 * 1147 * @guest_offset and @bytes indicate the offset and length of the 1148 * request. 1149 * 1150 * @l2_slice contains the L2 entries of all clusters involved in this 1151 * write request. 1152 * 1153 * If @keep_old is true it means that the clusters were already 1154 * allocated and will be overwritten. If false then the clusters are 1155 * new and we have to decrease the reference count of the old ones. 1156 * 1157 * Returns 0 on success, -errno on failure. 
1158 */ 1159 static int coroutine_fn calculate_l2_meta(BlockDriverState *bs, 1160 uint64_t host_cluster_offset, 1161 uint64_t guest_offset, unsigned bytes, 1162 uint64_t *l2_slice, QCowL2Meta **m, 1163 bool keep_old) 1164 { 1165 BDRVQcow2State *s = bs->opaque; 1166 int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset); 1167 uint64_t l2_entry, l2_bitmap; 1168 unsigned cow_start_from, cow_end_to; 1169 unsigned cow_start_to = offset_into_cluster(s, guest_offset); 1170 unsigned cow_end_from = cow_start_to + bytes; 1171 unsigned nb_clusters = size_to_clusters(s, cow_end_from); 1172 QCowL2Meta *old_m = *m; 1173 QCow2SubclusterType type; 1174 int i; 1175 bool skip_cow = keep_old; 1176 1177 assert(nb_clusters <= s->l2_slice_size - l2_index); 1178 1179 /* Check the type of all affected subclusters */ 1180 for (i = 0; i < nb_clusters; i++) { 1181 l2_entry = get_l2_entry(s, l2_slice, l2_index + i); 1182 l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i); 1183 if (skip_cow) { 1184 unsigned write_from = MAX(cow_start_to, i << s->cluster_bits); 1185 unsigned write_to = MIN(cow_end_from, (i + 1) << s->cluster_bits); 1186 int first_sc = offset_to_sc_index(s, write_from); 1187 int last_sc = offset_to_sc_index(s, write_to - 1); 1188 int cnt = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap, 1189 first_sc, &type); 1190 /* Is any of the subclusters of type != QCOW2_SUBCLUSTER_NORMAL ? 
*/ 1191 if (type != QCOW2_SUBCLUSTER_NORMAL || first_sc + cnt <= last_sc) { 1192 skip_cow = false; 1193 } 1194 } else { 1195 /* If we can't skip the cow we can still look for invalid entries */ 1196 type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, 0); 1197 } 1198 if (type == QCOW2_SUBCLUSTER_INVALID) { 1199 int l1_index = offset_to_l1_index(s, guest_offset); 1200 uint64_t l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; 1201 qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster " 1202 "entry found (L2 offset: %#" PRIx64 1203 ", L2 index: %#x)", 1204 l2_offset, l2_index + i); 1205 return -EIO; 1206 } 1207 } 1208 1209 if (skip_cow) { 1210 return 0; 1211 } 1212 1213 /* Get the L2 entry of the first cluster */ 1214 l2_entry = get_l2_entry(s, l2_slice, l2_index); 1215 l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index); 1216 sc_index = offset_to_sc_index(s, guest_offset); 1217 type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index); 1218 1219 if (!keep_old) { 1220 switch (type) { 1221 case QCOW2_SUBCLUSTER_COMPRESSED: 1222 cow_start_from = 0; 1223 break; 1224 case QCOW2_SUBCLUSTER_NORMAL: 1225 case QCOW2_SUBCLUSTER_ZERO_ALLOC: 1226 case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: 1227 if (has_subclusters(s)) { 1228 /* Skip all leading zero and unallocated subclusters */ 1229 uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC; 1230 cow_start_from = 1231 MIN(sc_index, ctz32(alloc_bitmap)) << s->subcluster_bits; 1232 } else { 1233 cow_start_from = 0; 1234 } 1235 break; 1236 case QCOW2_SUBCLUSTER_ZERO_PLAIN: 1237 case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN: 1238 cow_start_from = sc_index << s->subcluster_bits; 1239 break; 1240 default: 1241 g_assert_not_reached(); 1242 } 1243 } else { 1244 switch (type) { 1245 case QCOW2_SUBCLUSTER_NORMAL: 1246 cow_start_from = cow_start_to; 1247 break; 1248 case QCOW2_SUBCLUSTER_ZERO_ALLOC: 1249 case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: 1250 cow_start_from = sc_index << s->subcluster_bits; 1251 break; 1252 default: 
1253 g_assert_not_reached(); 1254 } 1255 } 1256 1257 /* Get the L2 entry of the last cluster */ 1258 l2_index += nb_clusters - 1; 1259 l2_entry = get_l2_entry(s, l2_slice, l2_index); 1260 l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index); 1261 sc_index = offset_to_sc_index(s, guest_offset + bytes - 1); 1262 type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index); 1263 1264 if (!keep_old) { 1265 switch (type) { 1266 case QCOW2_SUBCLUSTER_COMPRESSED: 1267 cow_end_to = ROUND_UP(cow_end_from, s->cluster_size); 1268 break; 1269 case QCOW2_SUBCLUSTER_NORMAL: 1270 case QCOW2_SUBCLUSTER_ZERO_ALLOC: 1271 case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: 1272 cow_end_to = ROUND_UP(cow_end_from, s->cluster_size); 1273 if (has_subclusters(s)) { 1274 /* Skip all trailing zero and unallocated subclusters */ 1275 uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC; 1276 cow_end_to -= 1277 MIN(s->subclusters_per_cluster - sc_index - 1, 1278 clz32(alloc_bitmap)) << s->subcluster_bits; 1279 } 1280 break; 1281 case QCOW2_SUBCLUSTER_ZERO_PLAIN: 1282 case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN: 1283 cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size); 1284 break; 1285 default: 1286 g_assert_not_reached(); 1287 } 1288 } else { 1289 switch (type) { 1290 case QCOW2_SUBCLUSTER_NORMAL: 1291 cow_end_to = cow_end_from; 1292 break; 1293 case QCOW2_SUBCLUSTER_ZERO_ALLOC: 1294 case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: 1295 cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size); 1296 break; 1297 default: 1298 g_assert_not_reached(); 1299 } 1300 } 1301 1302 *m = g_malloc0(sizeof(**m)); 1303 **m = (QCowL2Meta) { 1304 .next = old_m, 1305 1306 .alloc_offset = host_cluster_offset, 1307 .offset = start_of_cluster(s, guest_offset), 1308 .nb_clusters = nb_clusters, 1309 1310 .keep_old_clusters = keep_old, 1311 1312 .cow_start = { 1313 .offset = cow_start_from, 1314 .nb_bytes = cow_start_to - cow_start_from, 1315 }, 1316 .cow_end = { 1317 .offset = cow_end_from, 1318 .nb_bytes = cow_end_to - 
cow_end_from, 1319 }, 1320 }; 1321 1322 qemu_co_queue_init(&(*m)->dependent_requests); 1323 QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); 1324 1325 return 0; 1326 } 1327 1328 /* 1329 * Returns true if writing to the cluster pointed to by @l2_entry 1330 * requires a new allocation (that is, if the cluster is unallocated 1331 * or has refcount > 1 and therefore cannot be written in-place). 1332 */ 1333 static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry) 1334 { 1335 switch (qcow2_get_cluster_type(bs, l2_entry)) { 1336 case QCOW2_CLUSTER_NORMAL: 1337 case QCOW2_CLUSTER_ZERO_ALLOC: 1338 if (l2_entry & QCOW_OFLAG_COPIED) { 1339 return false; 1340 } 1341 /* fallthrough */ 1342 case QCOW2_CLUSTER_UNALLOCATED: 1343 case QCOW2_CLUSTER_COMPRESSED: 1344 case QCOW2_CLUSTER_ZERO_PLAIN: 1345 return true; 1346 default: 1347 abort(); 1348 } 1349 } 1350 1351 /* 1352 * Returns the number of contiguous clusters that can be written to 1353 * using one single write request, starting from @l2_index. 1354 * At most @nb_clusters are checked. 1355 * 1356 * If @new_alloc is true this counts clusters that are either 1357 * unallocated, or allocated but with refcount > 1 (so they need to be 1358 * newly allocated and COWed). 1359 * 1360 * If @new_alloc is false this counts clusters that are already 1361 * allocated and can be overwritten in-place (this includes clusters 1362 * of type QCOW2_CLUSTER_ZERO_ALLOC). 
1363 */ 1364 static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters, 1365 uint64_t *l2_slice, int l2_index, 1366 bool new_alloc) 1367 { 1368 BDRVQcow2State *s = bs->opaque; 1369 uint64_t l2_entry = get_l2_entry(s, l2_slice, l2_index); 1370 uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK; 1371 int i; 1372 1373 for (i = 0; i < nb_clusters; i++) { 1374 l2_entry = get_l2_entry(s, l2_slice, l2_index + i); 1375 if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) { 1376 break; 1377 } 1378 if (!new_alloc) { 1379 if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) { 1380 break; 1381 } 1382 expected_offset += s->cluster_size; 1383 } 1384 } 1385 1386 assert(i <= nb_clusters); 1387 return i; 1388 } 1389 1390 /* 1391 * Check if there already is an AIO write request in flight which allocates 1392 * the same cluster. In this case we need to wait until the previous 1393 * request has completed and updated the L2 table accordingly. 1394 * 1395 * Returns: 1396 * 0 if there was no dependency. *cur_bytes indicates the number of 1397 * bytes from guest_offset that can be read before the next 1398 * dependency must be processed (or the request is complete) 1399 * 1400 * -EAGAIN if we had to wait for another request, previously gathered 1401 * information on cluster allocation may be invalid now. The caller 1402 * must start over anyway, so consider *cur_bytes undefined. 
1403 */ 1404 static int coroutine_fn handle_dependencies(BlockDriverState *bs, 1405 uint64_t guest_offset, 1406 uint64_t *cur_bytes, QCowL2Meta **m) 1407 { 1408 BDRVQcow2State *s = bs->opaque; 1409 QCowL2Meta *old_alloc; 1410 uint64_t bytes = *cur_bytes; 1411 1412 QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { 1413 1414 uint64_t start = guest_offset; 1415 uint64_t end = start + bytes; 1416 uint64_t old_start = start_of_cluster(s, l2meta_cow_start(old_alloc)); 1417 uint64_t old_end = ROUND_UP(l2meta_cow_end(old_alloc), s->cluster_size); 1418 1419 if (end <= old_start || start >= old_end) { 1420 /* No intersection */ 1421 continue; 1422 } 1423 1424 if (old_alloc->keep_old_clusters && 1425 (end <= l2meta_cow_start(old_alloc) || 1426 start >= l2meta_cow_end(old_alloc))) 1427 { 1428 /* 1429 * Clusters intersect but COW areas don't. And cluster itself is 1430 * already allocated. So, there is no actual conflict. 1431 */ 1432 continue; 1433 } 1434 1435 /* Conflict */ 1436 1437 if (start < old_start) { 1438 /* Stop at the start of a running allocation */ 1439 bytes = old_start - start; 1440 } else { 1441 bytes = 0; 1442 } 1443 1444 /* 1445 * Stop if an l2meta already exists. After yielding, it wouldn't 1446 * be valid any more, so we'd have to clean up the old L2Metas 1447 * and deal with requests depending on them before starting to 1448 * gather new ones. Not worth the trouble. 1449 */ 1450 if (bytes == 0 && *m) { 1451 *cur_bytes = 0; 1452 return 0; 1453 } 1454 1455 if (bytes == 0) { 1456 /* 1457 * Wait for the dependency to complete. We need to recheck 1458 * the free/allocated clusters when we continue. 
1459 */ 1460 qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); 1461 return -EAGAIN; 1462 } 1463 } 1464 1465 /* Make sure that existing clusters and new allocations are only used up to 1466 * the next dependency if we shortened the request above */ 1467 *cur_bytes = bytes; 1468 1469 return 0; 1470 } 1471 1472 /* 1473 * Checks how many already allocated clusters that don't require a new 1474 * allocation there are at the given guest_offset (up to *bytes). 1475 * If *host_offset is not INV_OFFSET, only physically contiguous clusters 1476 * beginning at this host offset are counted. 1477 * 1478 * Note that guest_offset may not be cluster aligned. In this case, the 1479 * returned *host_offset points to exact byte referenced by guest_offset and 1480 * therefore isn't cluster aligned as well. 1481 * 1482 * Returns: 1483 * 0: if no allocated clusters are available at the given offset. 1484 * *bytes is normally unchanged. It is set to 0 if the cluster 1485 * is allocated and can be overwritten in-place but doesn't have 1486 * the right physical offset. 1487 * 1488 * 1: if allocated clusters that can be overwritten in place are 1489 * available at the requested offset. *bytes may have decreased 1490 * and describes the length of the area that can be written to. 1491 * 1492 * -errno: in error cases 1493 */ 1494 static int coroutine_fn handle_copied(BlockDriverState *bs, 1495 uint64_t guest_offset, uint64_t *host_offset, uint64_t *bytes, 1496 QCowL2Meta **m) 1497 { 1498 BDRVQcow2State *s = bs->opaque; 1499 int l2_index; 1500 uint64_t l2_entry, cluster_offset; 1501 uint64_t *l2_slice; 1502 uint64_t nb_clusters; 1503 unsigned int keep_clusters; 1504 int ret; 1505 1506 trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, 1507 *bytes); 1508 1509 assert(*host_offset == INV_OFFSET || offset_into_cluster(s, guest_offset) 1510 == offset_into_cluster(s, *host_offset)); 1511 1512 /* 1513 * Calculate the number of clusters to look for. 
We stop at L2 slice 1514 * boundaries to keep things simple. 1515 */ 1516 nb_clusters = 1517 size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); 1518 1519 l2_index = offset_to_l2_slice_index(s, guest_offset); 1520 nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); 1521 /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */ 1522 nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits); 1523 1524 /* Find L2 entry for the first involved cluster */ 1525 ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); 1526 if (ret < 0) { 1527 return ret; 1528 } 1529 1530 l2_entry = get_l2_entry(s, l2_slice, l2_index); 1531 cluster_offset = l2_entry & L2E_OFFSET_MASK; 1532 1533 if (!cluster_needs_new_alloc(bs, l2_entry)) { 1534 if (offset_into_cluster(s, cluster_offset)) { 1535 qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset " 1536 "%#" PRIx64 " unaligned (guest offset: %#" 1537 PRIx64 ")", l2_entry & QCOW_OFLAG_ZERO ? 1538 "Preallocated zero" : "Data", 1539 cluster_offset, guest_offset); 1540 ret = -EIO; 1541 goto out; 1542 } 1543 1544 /* If a specific host_offset is required, check it */ 1545 if (*host_offset != INV_OFFSET && cluster_offset != *host_offset) { 1546 *bytes = 0; 1547 ret = 0; 1548 goto out; 1549 } 1550 1551 /* We keep all QCOW_OFLAG_COPIED clusters */ 1552 keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice, 1553 l2_index, false); 1554 assert(keep_clusters <= nb_clusters); 1555 1556 *bytes = MIN(*bytes, 1557 keep_clusters * s->cluster_size 1558 - offset_into_cluster(s, guest_offset)); 1559 assert(*bytes != 0); 1560 1561 ret = calculate_l2_meta(bs, cluster_offset, guest_offset, 1562 *bytes, l2_slice, m, true); 1563 if (ret < 0) { 1564 goto out; 1565 } 1566 1567 ret = 1; 1568 } else { 1569 ret = 0; 1570 } 1571 1572 /* Cleanup */ 1573 out: 1574 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 1575 1576 /* Only return a host offset if we actually made progress. 
Otherwise we 1577 * would make requirements for handle_alloc() that it can't fulfill */ 1578 if (ret > 0) { 1579 *host_offset = cluster_offset + offset_into_cluster(s, guest_offset); 1580 } 1581 1582 return ret; 1583 } 1584 1585 /* 1586 * Allocates new clusters for the given guest_offset. 1587 * 1588 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to 1589 * contain the number of clusters that have been allocated and are contiguous 1590 * in the image file. 1591 * 1592 * If *host_offset is not INV_OFFSET, it specifies the offset in the image file 1593 * at which the new clusters must start. *nb_clusters can be 0 on return in 1594 * this case if the cluster at host_offset is already in use. If *host_offset 1595 * is INV_OFFSET, the clusters can be allocated anywhere in the image file. 1596 * 1597 * *host_offset is updated to contain the offset into the image file at which 1598 * the first allocated cluster starts. 1599 * 1600 * Return 0 on success and -errno in error cases. -EAGAIN means that the 1601 * function has been waiting for another request and the allocation must be 1602 * restarted, but the whole request should not be failed. 
1603 */ 1604 static int coroutine_fn do_alloc_cluster_offset(BlockDriverState *bs, 1605 uint64_t guest_offset, 1606 uint64_t *host_offset, 1607 uint64_t *nb_clusters) 1608 { 1609 BDRVQcow2State *s = bs->opaque; 1610 1611 trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, 1612 *host_offset, *nb_clusters); 1613 1614 if (has_data_file(bs)) { 1615 assert(*host_offset == INV_OFFSET || 1616 *host_offset == start_of_cluster(s, guest_offset)); 1617 *host_offset = start_of_cluster(s, guest_offset); 1618 return 0; 1619 } 1620 1621 /* Allocate new clusters */ 1622 trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); 1623 if (*host_offset == INV_OFFSET) { 1624 int64_t cluster_offset = 1625 qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); 1626 if (cluster_offset < 0) { 1627 return cluster_offset; 1628 } 1629 *host_offset = cluster_offset; 1630 return 0; 1631 } else { 1632 int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); 1633 if (ret < 0) { 1634 return ret; 1635 } 1636 *nb_clusters = ret; 1637 return 0; 1638 } 1639 } 1640 1641 /* 1642 * Allocates new clusters for an area that is either still unallocated or 1643 * cannot be overwritten in-place. If *host_offset is not INV_OFFSET, 1644 * clusters are only allocated if the new allocation can match the specified 1645 * host offset. 1646 * 1647 * Note that guest_offset may not be cluster aligned. In this case, the 1648 * returned *host_offset points to exact byte referenced by guest_offset and 1649 * therefore isn't cluster aligned as well. 1650 * 1651 * Returns: 1652 * 0: if no clusters could be allocated. *bytes is set to 0, 1653 * *host_offset is left unchanged. 1654 * 1655 * 1: if new clusters were allocated. *bytes may be decreased if the 1656 * new allocation doesn't cover all of the requested area. 1657 * *host_offset is updated to contain the host offset of the first 1658 * newly allocated cluster. 
1659 * 1660 * -errno: in error cases 1661 */ 1662 static int coroutine_fn handle_alloc(BlockDriverState *bs, 1663 uint64_t guest_offset, uint64_t *host_offset, uint64_t *bytes, 1664 QCowL2Meta **m) 1665 { 1666 BDRVQcow2State *s = bs->opaque; 1667 int l2_index; 1668 uint64_t *l2_slice; 1669 uint64_t nb_clusters; 1670 int ret; 1671 1672 uint64_t alloc_cluster_offset; 1673 1674 trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, 1675 *bytes); 1676 assert(*bytes > 0); 1677 1678 /* 1679 * Calculate the number of clusters to look for. We stop at L2 slice 1680 * boundaries to keep things simple. 1681 */ 1682 nb_clusters = 1683 size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); 1684 1685 l2_index = offset_to_l2_slice_index(s, guest_offset); 1686 nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); 1687 /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */ 1688 nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits); 1689 1690 /* Find L2 entry for the first involved cluster */ 1691 ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index); 1692 if (ret < 0) { 1693 return ret; 1694 } 1695 1696 nb_clusters = count_single_write_clusters(bs, nb_clusters, 1697 l2_slice, l2_index, true); 1698 1699 /* This function is only called when there were no non-COW clusters, so if 1700 * we can't find any unallocated or COW clusters either, something is 1701 * wrong with our code. */ 1702 assert(nb_clusters > 0); 1703 1704 /* Allocate at a given offset in the image file */ 1705 alloc_cluster_offset = *host_offset == INV_OFFSET ? 
INV_OFFSET : 1706 start_of_cluster(s, *host_offset); 1707 ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, 1708 &nb_clusters); 1709 if (ret < 0) { 1710 goto out; 1711 } 1712 1713 /* Can't extend contiguous allocation */ 1714 if (nb_clusters == 0) { 1715 *bytes = 0; 1716 ret = 0; 1717 goto out; 1718 } 1719 1720 assert(alloc_cluster_offset != INV_OFFSET); 1721 1722 /* 1723 * Save info needed for meta data update. 1724 * 1725 * requested_bytes: Number of bytes from the start of the first 1726 * newly allocated cluster to the end of the (possibly shortened 1727 * before) write request. 1728 * 1729 * avail_bytes: Number of bytes from the start of the first 1730 * newly allocated to the end of the last newly allocated cluster. 1731 * 1732 * nb_bytes: The number of bytes from the start of the first 1733 * newly allocated cluster to the end of the area that the write 1734 * request actually writes to (excluding COW at the end) 1735 */ 1736 uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset); 1737 int avail_bytes = nb_clusters << s->cluster_bits; 1738 int nb_bytes = MIN(requested_bytes, avail_bytes); 1739 1740 *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); 1741 *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset)); 1742 assert(*bytes != 0); 1743 1744 ret = calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, 1745 l2_slice, m, false); 1746 if (ret < 0) { 1747 goto out; 1748 } 1749 1750 ret = 1; 1751 1752 out: 1753 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 1754 return ret; 1755 } 1756 1757 /* 1758 * For a given area on the virtual disk defined by @offset and @bytes, 1759 * find the corresponding area on the qcow2 image, allocating new 1760 * clusters (or subclusters) if necessary. The result can span a 1761 * combination of allocated and previously unallocated clusters. 1762 * 1763 * Note that offset may not be cluster aligned. 
* In this case, the returned *host_offset points to the exact byte
 * referenced by offset and therefore isn't cluster aligned either.
 *
 * On return, @host_offset is set to the beginning of the requested
 * area. This area is guaranteed to be contiguous on the qcow2 file
 * but it can be smaller than initially requested. In this case @bytes
 * is updated with the actual size.
 *
 * If any clusters or subclusters were allocated then @m contains a
 * list with the information of all the affected regions. Note that
 * this can happen regardless of whether this function succeeds or
 * not. The caller is responsible for updating the L2 metadata of the
 * allocated clusters (on success) or freeing them (on failure), and
 * for clearing the contents of @m afterwards in both cases.
 *
 * If the request conflicts with another write request in flight, the coroutine
 * is queued and will be reentered when the dependency has completed.
1781 * 1782 * Return 0 on success and -errno in error cases 1783 */ 1784 int coroutine_fn qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset, 1785 unsigned int *bytes, 1786 uint64_t *host_offset, 1787 QCowL2Meta **m) 1788 { 1789 BDRVQcow2State *s = bs->opaque; 1790 uint64_t start, remaining; 1791 uint64_t cluster_offset; 1792 uint64_t cur_bytes; 1793 int ret; 1794 1795 trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes); 1796 1797 again: 1798 start = offset; 1799 remaining = *bytes; 1800 cluster_offset = INV_OFFSET; 1801 *host_offset = INV_OFFSET; 1802 cur_bytes = 0; 1803 *m = NULL; 1804 1805 while (true) { 1806 1807 if (*host_offset == INV_OFFSET && cluster_offset != INV_OFFSET) { 1808 *host_offset = cluster_offset; 1809 } 1810 1811 assert(remaining >= cur_bytes); 1812 1813 start += cur_bytes; 1814 remaining -= cur_bytes; 1815 1816 if (cluster_offset != INV_OFFSET) { 1817 cluster_offset += cur_bytes; 1818 } 1819 1820 if (remaining == 0) { 1821 break; 1822 } 1823 1824 cur_bytes = remaining; 1825 1826 /* 1827 * Now start gathering as many contiguous clusters as possible: 1828 * 1829 * 1. Check for overlaps with in-flight allocations 1830 * 1831 * a) Overlap not in the first cluster -> shorten this request and 1832 * let the caller handle the rest in its next loop iteration. 1833 * 1834 * b) Real overlaps of two requests. Yield and restart the search 1835 * for contiguous clusters (the situation could have changed 1836 * while we were sleeping) 1837 * 1838 * c) TODO: Request starts in the same cluster as the in-flight 1839 * allocation ends. Shorten the COW of the in-fight allocation, 1840 * set cluster_offset to write to the same cluster and set up 1841 * the right synchronisation between the in-flight request and 1842 * the new one. 1843 */ 1844 ret = handle_dependencies(bs, start, &cur_bytes, m); 1845 if (ret == -EAGAIN) { 1846 /* Currently handle_dependencies() doesn't yield if we already had 1847 * an allocation. 
If it did, we would have to clean up the L2Meta 1848 * structs before starting over. */ 1849 assert(*m == NULL); 1850 goto again; 1851 } else if (ret < 0) { 1852 return ret; 1853 } else if (cur_bytes == 0) { 1854 break; 1855 } else { 1856 /* handle_dependencies() may have decreased cur_bytes (shortened 1857 * the allocations below) so that the next dependency is processed 1858 * correctly during the next loop iteration. */ 1859 } 1860 1861 /* 1862 * 2. Count contiguous COPIED clusters. 1863 */ 1864 ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); 1865 if (ret < 0) { 1866 return ret; 1867 } else if (ret) { 1868 continue; 1869 } else if (cur_bytes == 0) { 1870 break; 1871 } 1872 1873 /* 1874 * 3. If the request still hasn't completed, allocate new clusters, 1875 * considering any cluster_offset of steps 1c or 2. 1876 */ 1877 ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); 1878 if (ret < 0) { 1879 return ret; 1880 } else if (ret) { 1881 continue; 1882 } else { 1883 assert(cur_bytes == 0); 1884 break; 1885 } 1886 } 1887 1888 *bytes -= remaining; 1889 assert(*bytes > 0); 1890 assert(*host_offset != INV_OFFSET); 1891 assert(offset_into_cluster(s, *host_offset) == 1892 offset_into_cluster(s, offset)); 1893 1894 return 0; 1895 } 1896 1897 /* 1898 * This discards as many clusters of nb_clusters as possible at once (i.e. 1899 * all clusters in the same L2 slice) and returns the number of discarded 1900 * clusters. 
1901 */ 1902 static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, 1903 uint64_t nb_clusters, 1904 enum qcow2_discard_type type, bool full_discard) 1905 { 1906 BDRVQcow2State *s = bs->opaque; 1907 uint64_t *l2_slice; 1908 int l2_index; 1909 int ret; 1910 int i; 1911 1912 ret = get_cluster_table(bs, offset, &l2_slice, &l2_index); 1913 if (ret < 0) { 1914 return ret; 1915 } 1916 1917 /* Limit nb_clusters to one L2 slice */ 1918 nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); 1919 assert(nb_clusters <= INT_MAX); 1920 1921 for (i = 0; i < nb_clusters; i++) { 1922 uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i); 1923 uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i); 1924 uint64_t new_l2_entry = old_l2_entry; 1925 uint64_t new_l2_bitmap = old_l2_bitmap; 1926 QCow2ClusterType cluster_type = 1927 qcow2_get_cluster_type(bs, old_l2_entry); 1928 bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) && 1929 !full_discard && 1930 (s->discard_no_unref && 1931 type == QCOW2_DISCARD_REQUEST); 1932 1933 /* 1934 * If full_discard is true, the cluster should not read back as zeroes, 1935 * but rather fall through to the backing file. 1936 * 1937 * If full_discard is false, make sure that a discarded area reads back 1938 * as zeroes for v3 images (we cannot do it for v2 without actually 1939 * writing a zero-filled buffer). We can skip the operation if the 1940 * cluster is already marked as zero, or if it's unallocated and we 1941 * don't have a backing file. 1942 * 1943 * TODO We might want to use bdrv_block_status(bs) here, but we're 1944 * holding s->lock, so that doesn't work today. 
1945 */ 1946 if (full_discard) { 1947 new_l2_entry = new_l2_bitmap = 0; 1948 } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) { 1949 if (has_subclusters(s)) { 1950 if (keep_reference) { 1951 new_l2_entry = old_l2_entry; 1952 } else { 1953 new_l2_entry = 0; 1954 } 1955 new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES; 1956 } else { 1957 if (s->qcow_version >= 3) { 1958 if (keep_reference) { 1959 new_l2_entry |= QCOW_OFLAG_ZERO; 1960 } else { 1961 new_l2_entry = QCOW_OFLAG_ZERO; 1962 } 1963 } else { 1964 new_l2_entry = 0; 1965 } 1966 } 1967 } 1968 1969 if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) { 1970 continue; 1971 } 1972 1973 /* First remove L2 entries */ 1974 qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 1975 set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry); 1976 if (has_subclusters(s)) { 1977 set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap); 1978 } 1979 if (!keep_reference) { 1980 /* Then decrease the refcount */ 1981 qcow2_free_any_cluster(bs, old_l2_entry, type); 1982 } else if (s->discard_passthrough[type] && 1983 (cluster_type == QCOW2_CLUSTER_NORMAL || 1984 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) { 1985 /* If we keep the reference, pass on the discard still */ 1986 bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK, 1987 s->cluster_size); 1988 } 1989 } 1990 1991 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 1992 1993 return nb_clusters; 1994 } 1995 1996 int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset, 1997 uint64_t bytes, enum qcow2_discard_type type, 1998 bool full_discard) 1999 { 2000 BDRVQcow2State *s = bs->opaque; 2001 uint64_t end_offset = offset + bytes; 2002 uint64_t nb_clusters; 2003 int64_t cleared; 2004 int ret; 2005 2006 /* Caller must pass aligned values, except at image end */ 2007 assert(QEMU_IS_ALIGNED(offset, s->cluster_size)); 2008 assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) || 2009 end_offset == bs->total_sectors << BDRV_SECTOR_BITS); 
2010 2011 nb_clusters = size_to_clusters(s, bytes); 2012 2013 s->cache_discards = true; 2014 2015 /* Each L2 slice is handled by its own loop iteration */ 2016 while (nb_clusters > 0) { 2017 cleared = discard_in_l2_slice(bs, offset, nb_clusters, type, 2018 full_discard); 2019 if (cleared < 0) { 2020 ret = cleared; 2021 goto fail; 2022 } 2023 2024 nb_clusters -= cleared; 2025 offset += (cleared * s->cluster_size); 2026 } 2027 2028 ret = 0; 2029 fail: 2030 s->cache_discards = false; 2031 qcow2_process_discards(bs, ret); 2032 2033 return ret; 2034 } 2035 2036 /* 2037 * This zeroes as many clusters of nb_clusters as possible at once (i.e. 2038 * all clusters in the same L2 slice) and returns the number of zeroed 2039 * clusters. 2040 */ 2041 static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, 2042 uint64_t nb_clusters, int flags) 2043 { 2044 BDRVQcow2State *s = bs->opaque; 2045 uint64_t *l2_slice; 2046 int l2_index; 2047 int ret; 2048 int i; 2049 2050 ret = get_cluster_table(bs, offset, &l2_slice, &l2_index); 2051 if (ret < 0) { 2052 return ret; 2053 } 2054 2055 /* Limit nb_clusters to one L2 slice */ 2056 nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index); 2057 assert(nb_clusters <= INT_MAX); 2058 2059 for (i = 0; i < nb_clusters; i++) { 2060 uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i); 2061 uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i); 2062 QCow2ClusterType type = qcow2_get_cluster_type(bs, old_l2_entry); 2063 bool unmap = (type == QCOW2_CLUSTER_COMPRESSED) || 2064 ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type)); 2065 uint64_t new_l2_entry = unmap ? 
0 : old_l2_entry; 2066 uint64_t new_l2_bitmap = old_l2_bitmap; 2067 2068 if (has_subclusters(s)) { 2069 new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES; 2070 } else { 2071 new_l2_entry |= QCOW_OFLAG_ZERO; 2072 } 2073 2074 if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) { 2075 continue; 2076 } 2077 2078 /* First update L2 entries */ 2079 qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 2080 set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry); 2081 if (has_subclusters(s)) { 2082 set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap); 2083 } 2084 2085 /* Then decrease the refcount */ 2086 if (unmap) { 2087 qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST); 2088 } 2089 } 2090 2091 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 2092 2093 return nb_clusters; 2094 } 2095 2096 static int coroutine_fn 2097 zero_l2_subclusters(BlockDriverState *bs, uint64_t offset, 2098 unsigned nb_subclusters) 2099 { 2100 BDRVQcow2State *s = bs->opaque; 2101 uint64_t *l2_slice; 2102 uint64_t old_l2_bitmap, l2_bitmap; 2103 int l2_index, ret, sc = offset_to_sc_index(s, offset); 2104 2105 /* For full clusters use zero_in_l2_slice() instead */ 2106 assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster); 2107 assert(sc + nb_subclusters <= s->subclusters_per_cluster); 2108 assert(offset_into_subcluster(s, offset) == 0); 2109 2110 ret = get_cluster_table(bs, offset, &l2_slice, &l2_index); 2111 if (ret < 0) { 2112 return ret; 2113 } 2114 2115 switch (qcow2_get_cluster_type(bs, get_l2_entry(s, l2_slice, l2_index))) { 2116 case QCOW2_CLUSTER_COMPRESSED: 2117 ret = -ENOTSUP; /* We cannot partially zeroize compressed clusters */ 2118 goto out; 2119 case QCOW2_CLUSTER_NORMAL: 2120 case QCOW2_CLUSTER_UNALLOCATED: 2121 break; 2122 default: 2123 g_assert_not_reached(); 2124 } 2125 2126 old_l2_bitmap = l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index); 2127 2128 l2_bitmap |= QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters); 2129 
l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters); 2130 2131 if (old_l2_bitmap != l2_bitmap) { 2132 set_l2_bitmap(s, l2_slice, l2_index, l2_bitmap); 2133 qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 2134 } 2135 2136 ret = 0; 2137 out: 2138 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 2139 2140 return ret; 2141 } 2142 2143 int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset, 2144 uint64_t bytes, int flags) 2145 { 2146 BDRVQcow2State *s = bs->opaque; 2147 uint64_t end_offset = offset + bytes; 2148 uint64_t nb_clusters; 2149 unsigned head, tail; 2150 int64_t cleared; 2151 int ret; 2152 2153 /* If we have to stay in sync with an external data file, zero out 2154 * s->data_file first. */ 2155 if (data_file_is_raw(bs)) { 2156 assert(has_data_file(bs)); 2157 ret = bdrv_co_pwrite_zeroes(s->data_file, offset, bytes, flags); 2158 if (ret < 0) { 2159 return ret; 2160 } 2161 } 2162 2163 /* Caller must pass aligned values, except at image end */ 2164 assert(offset_into_subcluster(s, offset) == 0); 2165 assert(offset_into_subcluster(s, end_offset) == 0 || 2166 end_offset >= bs->total_sectors << BDRV_SECTOR_BITS); 2167 2168 /* 2169 * The zero flag is only supported by version 3 and newer. However, if we 2170 * have no backing file, we can resort to discard in version 2. 2171 */ 2172 if (s->qcow_version < 3) { 2173 if (!bs->backing) { 2174 return qcow2_cluster_discard(bs, offset, bytes, 2175 QCOW2_DISCARD_REQUEST, false); 2176 } 2177 return -ENOTSUP; 2178 } 2179 2180 head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset; 2181 offset += head; 2182 2183 tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 
0 : 2184 end_offset - MAX(offset, start_of_cluster(s, end_offset)); 2185 end_offset -= tail; 2186 2187 s->cache_discards = true; 2188 2189 if (head) { 2190 ret = zero_l2_subclusters(bs, offset - head, 2191 size_to_subclusters(s, head)); 2192 if (ret < 0) { 2193 goto fail; 2194 } 2195 } 2196 2197 /* Each L2 slice is handled by its own loop iteration */ 2198 nb_clusters = size_to_clusters(s, end_offset - offset); 2199 2200 while (nb_clusters > 0) { 2201 cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags); 2202 if (cleared < 0) { 2203 ret = cleared; 2204 goto fail; 2205 } 2206 2207 nb_clusters -= cleared; 2208 offset += (cleared * s->cluster_size); 2209 } 2210 2211 if (tail) { 2212 ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail)); 2213 if (ret < 0) { 2214 goto fail; 2215 } 2216 } 2217 2218 ret = 0; 2219 fail: 2220 s->cache_discards = false; 2221 qcow2_process_discards(bs, ret); 2222 2223 return ret; 2224 } 2225 2226 /* 2227 * Expands all zero clusters in a specific L1 table (or deallocates them, for 2228 * non-backed non-pre-allocated zero clusters). 2229 * 2230 * l1_entries and *visited_l1_entries are used to keep track of progress for 2231 * status_cb(). l1_entries contains the total number of L1 entries and 2232 * *visited_l1_entries counts all visited L1 entries. 
2233 */ 2234 static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, 2235 int l1_size, int64_t *visited_l1_entries, 2236 int64_t l1_entries, 2237 BlockDriverAmendStatusCB *status_cb, 2238 void *cb_opaque) 2239 { 2240 BDRVQcow2State *s = bs->opaque; 2241 bool is_active_l1 = (l1_table == s->l1_table); 2242 uint64_t *l2_slice = NULL; 2243 unsigned slice, slice_size2, n_slices; 2244 int ret; 2245 int i, j; 2246 2247 /* qcow2_downgrade() is not allowed in images with subclusters */ 2248 assert(!has_subclusters(s)); 2249 2250 slice_size2 = s->l2_slice_size * l2_entry_size(s); 2251 n_slices = s->cluster_size / slice_size2; 2252 2253 if (!is_active_l1) { 2254 /* inactive L2 tables require a buffer to be stored in when loading 2255 * them from disk */ 2256 l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2); 2257 if (l2_slice == NULL) { 2258 return -ENOMEM; 2259 } 2260 } 2261 2262 for (i = 0; i < l1_size; i++) { 2263 uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; 2264 uint64_t l2_refcount; 2265 2266 if (!l2_offset) { 2267 /* unallocated */ 2268 (*visited_l1_entries)++; 2269 if (status_cb) { 2270 status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); 2271 } 2272 continue; 2273 } 2274 2275 if (offset_into_cluster(s, l2_offset)) { 2276 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" 2277 PRIx64 " unaligned (L1 index: %#x)", 2278 l2_offset, i); 2279 ret = -EIO; 2280 goto fail; 2281 } 2282 2283 ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, 2284 &l2_refcount); 2285 if (ret < 0) { 2286 goto fail; 2287 } 2288 2289 for (slice = 0; slice < n_slices; slice++) { 2290 uint64_t slice_offset = l2_offset + slice * slice_size2; 2291 bool l2_dirty = false; 2292 if (is_active_l1) { 2293 /* get active L2 tables from cache */ 2294 ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset, 2295 (void **)&l2_slice); 2296 } else { 2297 /* load inactive L2 tables from disk */ 2298 ret = bdrv_pread(bs->file, slice_offset, 
slice_size2, 2299 l2_slice, 0); 2300 } 2301 if (ret < 0) { 2302 goto fail; 2303 } 2304 2305 for (j = 0; j < s->l2_slice_size; j++) { 2306 uint64_t l2_entry = get_l2_entry(s, l2_slice, j); 2307 int64_t offset = l2_entry & L2E_OFFSET_MASK; 2308 QCow2ClusterType cluster_type = 2309 qcow2_get_cluster_type(bs, l2_entry); 2310 2311 if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN && 2312 cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) { 2313 continue; 2314 } 2315 2316 if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 2317 if (!bs->backing) { 2318 /* 2319 * not backed; therefore we can simply deallocate the 2320 * cluster. No need to call set_l2_bitmap(), this 2321 * function doesn't support images with subclusters. 2322 */ 2323 set_l2_entry(s, l2_slice, j, 0); 2324 l2_dirty = true; 2325 continue; 2326 } 2327 2328 offset = qcow2_alloc_clusters(bs, s->cluster_size); 2329 if (offset < 0) { 2330 ret = offset; 2331 goto fail; 2332 } 2333 2334 /* The offset must fit in the offset field */ 2335 assert((offset & L2E_OFFSET_MASK) == offset); 2336 2337 if (l2_refcount > 1) { 2338 /* For shared L2 tables, set the refcount accordingly 2339 * (it is already 1 and needs to be l2_refcount) */ 2340 ret = qcow2_update_cluster_refcount( 2341 bs, offset >> s->cluster_bits, 2342 refcount_diff(1, l2_refcount), false, 2343 QCOW2_DISCARD_OTHER); 2344 if (ret < 0) { 2345 qcow2_free_clusters(bs, offset, s->cluster_size, 2346 QCOW2_DISCARD_OTHER); 2347 goto fail; 2348 } 2349 } 2350 } 2351 2352 if (offset_into_cluster(s, offset)) { 2353 int l2_index = slice * s->l2_slice_size + j; 2354 qcow2_signal_corruption( 2355 bs, true, -1, -1, 2356 "Cluster allocation offset " 2357 "%#" PRIx64 " unaligned (L2 offset: %#" 2358 PRIx64 ", L2 index: %#x)", offset, 2359 l2_offset, l2_index); 2360 if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 2361 qcow2_free_clusters(bs, offset, s->cluster_size, 2362 QCOW2_DISCARD_ALWAYS); 2363 } 2364 ret = -EIO; 2365 goto fail; 2366 } 2367 2368 ret = qcow2_pre_write_overlap_check(bs, 0, 
offset, 2369 s->cluster_size, true); 2370 if (ret < 0) { 2371 if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 2372 qcow2_free_clusters(bs, offset, s->cluster_size, 2373 QCOW2_DISCARD_ALWAYS); 2374 } 2375 goto fail; 2376 } 2377 2378 ret = bdrv_pwrite_zeroes(s->data_file, offset, 2379 s->cluster_size, 0); 2380 if (ret < 0) { 2381 if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { 2382 qcow2_free_clusters(bs, offset, s->cluster_size, 2383 QCOW2_DISCARD_ALWAYS); 2384 } 2385 goto fail; 2386 } 2387 2388 if (l2_refcount == 1) { 2389 set_l2_entry(s, l2_slice, j, offset | QCOW_OFLAG_COPIED); 2390 } else { 2391 set_l2_entry(s, l2_slice, j, offset); 2392 } 2393 /* 2394 * No need to call set_l2_bitmap() after set_l2_entry() because 2395 * this function doesn't support images with subclusters. 2396 */ 2397 l2_dirty = true; 2398 } 2399 2400 if (is_active_l1) { 2401 if (l2_dirty) { 2402 qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); 2403 qcow2_cache_depends_on_flush(s->l2_table_cache); 2404 } 2405 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 2406 } else { 2407 if (l2_dirty) { 2408 ret = qcow2_pre_write_overlap_check( 2409 bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, 2410 slice_offset, slice_size2, false); 2411 if (ret < 0) { 2412 goto fail; 2413 } 2414 2415 ret = bdrv_pwrite(bs->file, slice_offset, slice_size2, 2416 l2_slice, 0); 2417 if (ret < 0) { 2418 goto fail; 2419 } 2420 } 2421 } 2422 } 2423 2424 (*visited_l1_entries)++; 2425 if (status_cb) { 2426 status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); 2427 } 2428 } 2429 2430 ret = 0; 2431 2432 fail: 2433 if (l2_slice) { 2434 if (!is_active_l1) { 2435 qemu_vfree(l2_slice); 2436 } else { 2437 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice); 2438 } 2439 } 2440 return ret; 2441 } 2442 2443 /* 2444 * For backed images, expands all zero clusters on the image. For non-backed 2445 * images, deallocates all non-pre-allocated zero clusters (and claims the 2446 * allocation for pre-allocated ones). 
This is important for downgrading to a 2447 * qcow2 version which doesn't yet support metadata zero clusters. 2448 */ 2449 int qcow2_expand_zero_clusters(BlockDriverState *bs, 2450 BlockDriverAmendStatusCB *status_cb, 2451 void *cb_opaque) 2452 { 2453 BDRVQcow2State *s = bs->opaque; 2454 uint64_t *l1_table = NULL; 2455 int64_t l1_entries = 0, visited_l1_entries = 0; 2456 int ret; 2457 int i, j; 2458 2459 if (status_cb) { 2460 l1_entries = s->l1_size; 2461 for (i = 0; i < s->nb_snapshots; i++) { 2462 l1_entries += s->snapshots[i].l1_size; 2463 } 2464 } 2465 2466 ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, 2467 &visited_l1_entries, l1_entries, 2468 status_cb, cb_opaque); 2469 if (ret < 0) { 2470 goto fail; 2471 } 2472 2473 /* Inactive L1 tables may point to active L2 tables - therefore it is 2474 * necessary to flush the L2 table cache before trying to access the L2 2475 * tables pointed to by inactive L1 entries (else we might try to expand 2476 * zero clusters that have already been expanded); furthermore, it is also 2477 * necessary to empty the L2 table cache, since it may contain tables which 2478 * are now going to be modified directly on disk, bypassing the cache. 2479 * qcow2_cache_empty() does both for us. 
*/ 2480 ret = qcow2_cache_empty(bs, s->l2_table_cache); 2481 if (ret < 0) { 2482 goto fail; 2483 } 2484 2485 for (i = 0; i < s->nb_snapshots; i++) { 2486 int l1_size2; 2487 uint64_t *new_l1_table; 2488 Error *local_err = NULL; 2489 2490 ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset, 2491 s->snapshots[i].l1_size, L1E_SIZE, 2492 QCOW_MAX_L1_SIZE, "Snapshot L1 table", 2493 &local_err); 2494 if (ret < 0) { 2495 error_report_err(local_err); 2496 goto fail; 2497 } 2498 2499 l1_size2 = s->snapshots[i].l1_size * L1E_SIZE; 2500 new_l1_table = g_try_realloc(l1_table, l1_size2); 2501 2502 if (!new_l1_table) { 2503 ret = -ENOMEM; 2504 goto fail; 2505 } 2506 2507 l1_table = new_l1_table; 2508 2509 ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset, l1_size2, 2510 l1_table, 0); 2511 if (ret < 0) { 2512 goto fail; 2513 } 2514 2515 for (j = 0; j < s->snapshots[i].l1_size; j++) { 2516 be64_to_cpus(&l1_table[j]); 2517 } 2518 2519 ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, 2520 &visited_l1_entries, l1_entries, 2521 status_cb, cb_opaque); 2522 if (ret < 0) { 2523 goto fail; 2524 } 2525 } 2526 2527 ret = 0; 2528 2529 fail: 2530 g_free(l1_table); 2531 return ret; 2532 } 2533 2534 void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry, 2535 uint64_t *coffset, int *csize) 2536 { 2537 BDRVQcow2State *s = bs->opaque; 2538 int nb_csectors; 2539 2540 assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED); 2541 2542 *coffset = l2_entry & s->cluster_offset_mask; 2543 2544 nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1; 2545 *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE - 2546 (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1)); 2547 } 2548