/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "block/block_int-io.h"
#include "block/dirty-bitmap.h"
#include "block/reqlist.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"
#include "qemu/co-shared-resource.h"
#include "qemu/coroutine.h"
#include "qemu/ratelimit.h"
#include "block/aio_task.h"
#include "qemu/error-report.h"
#include "qemu/memalign.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)
#define BLOCK_COPY_MAX_WORKERS 64
#define BLOCK_COPY_SLICE_TIME 100000000ULL /* ns */
#define BLOCK_COPY_CLUSTER_SIZE_DEFAULT (1 << 16)

typedef enum {
    COPY_READ_WRITE_CLUSTER,
    COPY_READ_WRITE,
    COPY_WRITE_ZEROES,
    COPY_RANGE_SMALL,
    COPY_RANGE_FULL
} BlockCopyMethod;

static coroutine_fn int block_copy_task_entry(AioTask *task);

typedef struct BlockCopyCallState {
    /* Fields initialized in block_copy_async() and never changed. */
    BlockCopyState *s;
    int64_t offset;
    int64_t bytes;
    int max_workers;
    int64_t max_chunk;
    bool ignore_ratelimit;
    BlockCopyAsyncCallbackFunc cb;
    void *cb_opaque;
    /* Coroutine where async block-copy is running */
    Coroutine *co;

    /* Fields whose state changes throughout the execution */
    bool finished; /* atomic */
    QemuCoSleep sleep; /* TODO: protect API with a lock */
    bool cancelled; /* atomic */
    /* To reference all call states from BlockCopyState */
    QLIST_ENTRY(BlockCopyCallState) list;

    /*
     * Fields that report information about return values and errors.
     * Protected by lock in BlockCopyState.
     */
    bool error_is_read;
    /*
     * @ret is set concurrently by tasks under mutex. Only set once by first
     * failed task (and untouched if no task failed).
     * After finishing (call_state->finished is true), it is not modified
     * anymore and may be safely read without mutex.
     */
    int ret;
} BlockCopyCallState;

typedef struct BlockCopyTask {
    AioTask task;

    /*
     * Fields initialized in block_copy_task_create()
     * and never changed.
     */
    BlockCopyState *s;
    BlockCopyCallState *call_state;
    /*
     * @method can also be set again in the while loop of
     * block_copy_dirty_clusters(), but it is never accessed concurrently
     * because the only other function that reads it is
     * block_copy_task_entry() and it is invoked afterwards in the same
     * iteration.
     */
    BlockCopyMethod method;

    /*
     * Generally, req is protected by lock in BlockCopyState. Still, req.offset
     * is only set on task creation, so it may be read concurrently after
     * creation. req.bytes is changed at most once, and only needs protection
     * against a parallel read while @bytes is being updated in
     * block_copy_task_shrink().
     */
    BlockReq req;
} BlockCopyTask;
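
/*
 * Illustrative lifecycle of a BlockCopyCallState as seen by a user of the
 * async API (a sketch only; "cs", "len", "my_cb" and "my_opaque" are
 * hypothetical caller-side names, the block_copy_* functions are defined
 * below in this file):
 *
 *     BlockCopyCallState *cs;
 *
 *     cs = block_copy_async(s, 0, len, BLOCK_COPY_MAX_WORKERS, 0,
 *                           my_cb, my_opaque);
 *     ... my_cb() is invoked once the copy finishes or is cancelled ...
 *     if (block_copy_call_succeeded(cs)) {
 *         ... success ...
 *     } else if (block_copy_call_failed(cs)) {
 *         ret = block_copy_call_status(cs, &error_is_read);
 *     }
 *     block_copy_call_free(cs);
 */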

static int64_t task_end(BlockCopyTask *task)
{
    return task->req.offset + task->req.bytes;
}

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, and the user is responsible for
     * appropriate permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;

    /*
     * Fields initialized in block_copy_state_new()
     * and never changed.
     */
    int64_t cluster_size;
    int64_t max_transfer;
    uint64_t len;
    BdrvRequestFlags write_flags;

    /*
     * Fields whose state changes throughout the execution
     * Protected by lock.
     */
    CoMutex lock;
    int64_t in_flight_bytes;
    BlockCopyMethod method;
    bool discard_source;
    BlockReqList reqs;
    QLIST_HEAD(, BlockCopyCallState) calls;
    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: It may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated; /* atomic */
    /* State fields that use a thread-safe API */
    BdrvDirtyBitmap *copy_bitmap;
    ProgressMeter *progress;
    SharedResource *mem;
    RateLimit rate_limit;
} BlockCopyState;

/* Called with lock held */
static int64_t block_copy_chunk_size(BlockCopyState *s)
{
    switch (s->method) {
    case COPY_READ_WRITE_CLUSTER:
        return s->cluster_size;
    case COPY_READ_WRITE:
    case COPY_RANGE_SMALL:
        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER),
                   s->max_transfer);
    case COPY_RANGE_FULL:
        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                   s->max_transfer);
    default:
        /* Cannot have COPY_WRITE_ZEROES here. */
        abort();
    }
}

/*
 * Search for the first dirty area in the offset/bytes range and create a task
 * at the beginning of it.
 */
static coroutine_fn BlockCopyTask *
block_copy_task_create(BlockCopyState *s, BlockCopyCallState *call_state,
                       int64_t offset, int64_t bytes)
{
    BlockCopyTask *task;
    int64_t max_chunk;

    QEMU_LOCK_GUARD(&s->lock);
    max_chunk = MIN_NON_ZERO(block_copy_chunk_size(s), call_state->max_chunk);
    if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
                                           offset, offset + bytes,
                                           max_chunk, &offset, &bytes))
    {
        return NULL;
    }

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    bytes = QEMU_ALIGN_UP(bytes, s->cluster_size);

    /* The region is dirty, so no existing tasks are possible in it. */
    assert(!reqlist_find_conflict(&s->reqs, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    task = g_new(BlockCopyTask, 1);
    *task = (BlockCopyTask) {
        .task.func = block_copy_task_entry,
        .s = s,
        .call_state = call_state,
        .method = s->method,
    };
    reqlist_init_req(&s->reqs, &task->req, offset, bytes);

    return task;
}
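
/*
 * Worked example of the chunk sizing above (numbers assume the defaults in
 * this file and an unconstrained driver, i.e. max_transfer is effectively
 * INT_MAX aligned down to the cluster size):
 *
 *     cluster_size = 64 KiB (BLOCK_COPY_CLUSTER_SIZE_DEFAULT)
 *
 *     COPY_READ_WRITE_CLUSTER:  chunk = 64 KiB
 *     COPY_READ_WRITE /
 *     COPY_RANGE_SMALL:         chunk = MIN(MAX(64 KiB, 1 MiB), max_transfer)
 *                                     = 1 MiB (BLOCK_COPY_MAX_BUFFER)
 *     COPY_RANGE_FULL:          chunk = MIN(MAX(64 KiB, 16 MiB), max_transfer)
 *                                     = 16 MiB (BLOCK_COPY_MAX_COPY_RANGE)
 *
 * block_copy_task_create() additionally clamps this with the caller's
 * max_chunk via MIN_NON_ZERO(), so max_chunk == 0 means "no extra limit".
 */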

/*
 * block_copy_task_shrink
 *
 * Drop the tail of the task to be handled later. Set the dirty bits back and
 * wake up all tasks waiting for us (maybe some of them do not intersect with
 * the shrunk task).
 */
static void coroutine_fn block_copy_task_shrink(BlockCopyTask *task,
                                                int64_t new_bytes)
{
    QEMU_LOCK_GUARD(&task->s->lock);
    if (new_bytes == task->req.bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < task->req.bytes);

    task->s->in_flight_bytes -= task->req.bytes - new_bytes;
    bdrv_set_dirty_bitmap(task->s->copy_bitmap,
                          task->req.offset + new_bytes,
                          task->req.bytes - new_bytes);

    reqlist_shrink_req(&task->req, new_bytes);
}

static void coroutine_fn block_copy_task_end(BlockCopyTask *task, int ret)
{
    QEMU_LOCK_GUARD(&task->s->lock);
    task->s->in_flight_bytes -= task->req.bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->req.offset,
                              task->req.bytes);
    }
    if (task->s->progress) {
        progress_set_remaining(task->s->progress,
                               bdrv_get_dirty_count(task->s->copy_bitmap) +
                               task->s->in_flight_bytes);
    }
    reqlist_remove_req(&task->req);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    ratelimit_destroy(&s->rate_limit);
    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
                              bool compress)
{
    /* Keep BDRV_REQ_SERIALISING set (or not set) in block_copy_state_new() */
    s->write_flags = (s->write_flags & BDRV_REQ_SERIALISING) |
        (compress ? BDRV_REQ_WRITE_COMPRESSED : 0);

    if (s->max_transfer < s->cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own).
         */
        s->method = COPY_READ_WRITE_CLUSTER;
    } else if (compress) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->method = COPY_READ_WRITE_CLUSTER;
    } else {
        /*
         * If copy_range is enabled, start with COPY_RANGE_SMALL, until the
         * first successful copy_range (look at block_copy_do_copy).
         */
        s->method = use_copy_range ? COPY_RANGE_SMALL : COPY_READ_WRITE;
    }
}
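
/*
 * Summary of the start method chosen by block_copy_set_copy_opts()
 * (derived from the code above):
 *
 *     max_transfer < cluster_size  ->  COPY_READ_WRITE_CLUSTER
 *     compress == true             ->  COPY_READ_WRITE_CLUSTER
 *     use_copy_range == true       ->  COPY_RANGE_SMALL (upgraded to
 *                                      COPY_RANGE_FULL after the first
 *                                      successful copy_range in
 *                                      block_copy_do_copy())
 *     otherwise                    ->  COPY_READ_WRITE
 */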

static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
                                                 int64_t min_cluster_size,
                                                 Error **errp)
{
    int ret;
    BlockDriverInfo bdi;
    bool target_does_cow;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    min_cluster_size = MAX(min_cluster_size,
                           (int64_t)BLOCK_COPY_CLUSTER_SIZE_DEFAULT);

    target_does_cow = bdrv_backing_chain_next(target);

    /*
     * If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible.
     */
    ret = bdrv_get_info(target, &bdi);
    if (ret == -ENOTSUP && !target_does_cow) {
        /* Cluster size is not defined */
        warn_report("The target block device doesn't provide information about "
                    "the block size and it doesn't have a backing file. The "
                    "(default) block size of %" PRIi64 " bytes is used. If the "
                    "actual block size of the target exceeds this value, the "
                    "backup may be unusable",
                    min_cluster_size);
        return min_cluster_size;
    } else if (ret < 0 && !target_does_cow) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
            "which has no backing file");
        error_append_hint(errp,
            "Aborting, since this may create an unusable destination image\n");
        return ret;
    } else if (ret < 0 && target_does_cow) {
        /* Not fatal; just trudge on ahead. */
        return min_cluster_size;
    }

    return MAX(min_cluster_size, bdi.cluster_size);
}
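
/*
 * Example outcomes of block_copy_calculate_cluster_size(), assuming the
 * default min_cluster_size (BLOCK_COPY_CLUSTER_SIZE_DEFAULT = 64 KiB); the
 * target cluster sizes below are hypothetical:
 *
 *     target reports bdi.cluster_size = 2 MiB  ->  MAX(64 KiB, 2 MiB) = 2 MiB
 *     target reports bdi.cluster_size = 4 KiB  ->  MAX(64 KiB, 4 KiB) = 64 KiB
 *     bdrv_get_info() returns -ENOTSUP and the target has no backing file
 *                                              ->  warn and use 64 KiB
 */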

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     BlockDriverState *copy_bitmap_bs,
                                     const BdrvDirtyBitmap *bitmap,
                                     bool discard_source,
                                     uint64_t min_cluster_size,
                                     Error **errp)
{
    ERRP_GUARD();
    BlockCopyState *s;
    int64_t cluster_size;
    BdrvDirtyBitmap *copy_bitmap;
    bool is_fleecing;

    GLOBAL_STATE_CODE();

    if (min_cluster_size > INT64_MAX) {
        error_setg(errp, "min-cluster-size too large: %" PRIu64 " > %" PRIi64,
                   min_cluster_size, INT64_MAX);
        return NULL;
    } else if (min_cluster_size && !is_power_of_2(min_cluster_size)) {
        error_setg(errp, "min-cluster-size needs to be a power of 2");
        return NULL;
    }

    cluster_size = block_copy_calculate_cluster_size(target->bs,
                                                     (int64_t)min_cluster_size,
                                                     errp);
    if (cluster_size < 0) {
        return NULL;
    }

    copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);
    if (bitmap) {
        if (!bdrv_merge_dirty_bitmap(copy_bitmap, bitmap, NULL, errp)) {
            error_prepend(errp, "Failed to merge bitmap '%s' to internal "
                          "copy-bitmap: ", bdrv_dirty_bitmap_name(bitmap));
            bdrv_release_dirty_bitmap(copy_bitmap);
            return NULL;
        }
    } else {
        bdrv_set_dirty_bitmap(copy_bitmap, 0,
                              bdrv_dirty_bitmap_size(copy_bitmap));
    }

    /*
     * If the source is in the backing chain of the target, assume that the
     * target is going to be used for "image fleecing", i.e. it should
     * represent a kind of snapshot of the source at backup-start point in
     * time, and the target is going to be read by somebody (for example, used
     * as an NBD export) during the backup job.
     *
     * In this case, we need to add the BDRV_REQ_SERIALISING write flag to
     * avoid intersection of backup writes and third-party reads from the
     * target; otherwise, when reading from the target, we may occasionally
     * read data that has already been updated by the guest.
     *
     * For more information see commit f8d59dfb40bb and test
     * tests/qemu-iotests/222
     */
    bdrv_graph_rdlock_main_loop();
    is_fleecing = bdrv_chain_contains(target->bs, source->bs);
    bdrv_graph_rdunlock_main_loop();

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = (is_fleecing ? BDRV_REQ_SERIALISING : 0),
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
        .max_transfer = QEMU_ALIGN_DOWN(
                                    block_copy_max_transfer(source, target),
                                    cluster_size),
    };

    s->discard_source = discard_source;
    block_copy_set_copy_opts(s, false, false);

    ratelimit_init(&s->rate_limit);
    qemu_co_mutex_init(&s->lock);
    QLIST_INIT(&s->reqs);
    QLIST_INIT(&s->calls);

    return s;
}

/* Only set before running the job, no need for locking. */
void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

/*
 * Takes ownership of @task
 *
 * If pool is NULL, directly run the task, otherwise schedule it into the pool.
 *
 * Returns: task.func return code if pool is NULL
 *          otherwise -ECANCELED if pool status is bad
 *          otherwise 0 (successfully scheduled)
 */
static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
                                            BlockCopyTask *task)
{
    if (!pool) {
        int ret = task->task.func(&task->task);

        g_free(task);
        return ret;
    }

    aio_task_pool_wait_slot(pool);
    if (aio_task_pool_status(pool) < 0) {
        co_put_to_shres(task->s->mem, task->req.bytes);
        block_copy_task_end(task, -ECANCELED);
        g_free(task);
        return -ECANCELED;
    }

    aio_task_pool_start_task(pool, &task->task);

    return 0;
}
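
/*
 * Memory throttling sketch: every task acquires task->req.bytes from s->mem
 * (co_get_from_shres() in block_copy_dirty_clusters()) before it is started,
 * and releases it when it completes or is cancelled. With the defaults above
 * this bounds in-flight data to BLOCK_COPY_MAX_MEM = 128 MiB, e.g. at most
 * 128 in-flight 1 MiB read-write chunks or 8 in-flight 16 MiB copy-range
 * chunks (concurrency is further limited by max_workers; copy_range itself
 * allocates no bounce buffer, so there the limit only caps in-flight bytes).
 */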

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to
 * clusters.
 *
 * No sync here: neither bitmap nor intersecting requests handling, only copy.
 *
 * @method is an in-out argument, so that copy_range can be either extended to
 * a full-size buffer or disabled if the copy_range attempt fails. The output
 * value of @method should be used for subsequent tasks.
 * Returns 0 on success.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_do_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                   BlockCopyMethod *method, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    switch (*method) {
    case COPY_WRITE_ZEROES:
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            *error_is_read = false;
        }
        return ret;

    case COPY_RANGE_SMALL:
    case COPY_RANGE_FULL:
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret >= 0) {
            /* Successful copy-range, increase chunk size. */
            *method = COPY_RANGE_FULL;
            return 0;
        }

        trace_block_copy_copy_range_fail(s, offset, ret);
        *method = COPY_READ_WRITE;
        /* Fall through to read+write with allocated buffer */

    case COPY_READ_WRITE_CLUSTER:
    case COPY_READ_WRITE:
        /*
         * In case of a failed copy_range request above, we may proceed with a
         * buffered request larger than BLOCK_COPY_MAX_BUFFER.
         * Still, further requests will be properly limited, so don't care too
         * much. Moreover, the most likely case (copy_range is unsupported for
         * the configuration, so the very first copy_range request fails)
         * is handled by setting the large copy_size only after the first
         * successful copy_range.
         */

        bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

        ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
        if (ret < 0) {
            trace_block_copy_read_fail(s, offset, ret);
            *error_is_read = true;
            goto out;
        }

        ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                             s->write_flags);
        if (ret < 0) {
            trace_block_copy_write_fail(s, offset, ret);
            *error_is_read = false;
            goto out;
        }

    out:
        qemu_vfree(bounce_buffer);
        break;

    default:
        abort();
    }

    return ret;
}

static coroutine_fn int block_copy_task_entry(AioTask *task)
{
    BlockCopyTask *t = container_of(task, BlockCopyTask, task);
    BlockCopyState *s = t->s;
    bool error_is_read = false;
    BlockCopyMethod method = t->method;
    int ret = -1;

    WITH_GRAPH_RDLOCK_GUARD() {
        ret = block_copy_do_copy(s, t->req.offset, t->req.bytes, &method,
                                 &error_is_read);
    }

    WITH_QEMU_LOCK_GUARD(&s->lock) {
        if (s->method == t->method) {
            s->method = method;
        }

        if (ret < 0) {
            if (!t->call_state->ret) {
                t->call_state->ret = ret;
                t->call_state->error_is_read = error_is_read;
            }
        } else if (s->progress) {
            progress_work_done(s->progress, t->req.bytes);
        }
    }
    co_put_to_shres(s->mem, t->req.bytes);
    block_copy_task_end(t, ret);

    if (s->discard_source && ret == 0) {
        int64_t nbytes =
            MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
        WITH_GRAPH_RDLOCK_GUARD() {
            bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
        }
    }

    return ret;
}

static coroutine_fn GRAPH_RDLOCK
int block_copy_block_status(BlockCopyState *s, int64_t offset, int64_t bytes,
                            int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (qatomic_read(&s->skip_unallocated)) {
        base = bdrv_backing_chain_next(s->source->bs);
    } else {
        base = NULL;
    }

    ret = bdrv_co_block_status_above(s->source->bs, base, offset, bytes, &num,
                                     NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just
         * fall back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
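
/*
 * Rounding example for block_copy_block_status() (cluster_size = 64 KiB,
 * the numbers are hypothetical):
 *
 *     status returns num = 100 KiB in the middle of the image
 *         -> num is aligned down to 64 KiB
 *     status reaches the end of a 130 KiB image (offset + num == s->len)
 *         -> num is aligned up so the final partial cluster is covered
 *     status fails, or returns num < 64 KiB
 *         -> fall back to one cluster, reported as ALLOCATED | DATA
 */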

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        /* protected in backup_run() */
        ret = bdrv_co_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes)
{
    QEMU_LOCK_GUARD(&s->lock);

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    if (s->progress) {
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s,
                                                  int64_t offset,
                                                  int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        block_copy_reset(s, offset, bytes);
    }

    *count = bytes;
    return ret;
}
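
/*
 * Typical use of block_copy_reset_unallocated(): a sync=top caller enables
 * skip_unallocated and then walks the whole device once, clearing copy_bitmap
 * bits for unallocated areas before (or while) copying. A sketch, assuming
 * coroutine context and placeholder names "s" and "len":
 *
 *     int64_t offset, count;
 *
 *     block_copy_set_skip_unallocated(s, true);
 *     for (offset = 0; offset < len; offset += count) {
 *         ret = block_copy_reset_unallocated(s, offset, &count);
 *         if (ret < 0) {
 *             ... handle error ...
 *         }
 *     }
 */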

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_dirty_clusters(BlockCopyCallState *call_state)
{
    BlockCopyState *s = call_state->s;
    int64_t offset = call_state->offset;
    int64_t bytes = call_state->bytes;

    int ret = 0;
    bool found_dirty = false;
    int64_t end = offset + bytes;
    AioTaskPool *aio = NULL;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes && aio_task_pool_status(aio) == 0 &&
           !qatomic_read(&call_state->cancelled)) {
        BlockCopyTask *task;
        int64_t status_bytes;

        task = block_copy_task_create(s, call_state, offset, bytes);
        if (!task) {
            /* No more dirty bits in the bitmap */
            trace_block_copy_skip_range(s, offset, bytes);
            break;
        }
        if (task->req.offset > offset) {
            trace_block_copy_skip_range(s, offset, task->req.offset - offset);
        }

        found_dirty = true;

        ret = block_copy_block_status(s, task->req.offset, task->req.bytes,
                                      &status_bytes);
        assert(ret >= 0); /* never fail */
        if (status_bytes < task->req.bytes) {
            block_copy_task_shrink(task, status_bytes);
        }
        if (qatomic_read(&s->skip_unallocated) &&
            !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_task_end(task, 0);
            trace_block_copy_skip_range(s, task->req.offset, task->req.bytes);
            offset = task_end(task);
            bytes = end - offset;
            g_free(task);
            continue;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            task->method = COPY_WRITE_ZEROES;
        }

        if (!call_state->ignore_ratelimit) {
            uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0);
            if (ns > 0) {
                block_copy_task_end(task, -EAGAIN);
                g_free(task);
                qemu_co_sleep_ns_wakeable(&call_state->sleep,
                                          QEMU_CLOCK_REALTIME, ns);
                continue;
            }
        }

        ratelimit_calculate_delay(&s->rate_limit, task->req.bytes);

        trace_block_copy_process(s, task->req.offset);

        co_get_from_shres(s->mem, task->req.bytes);

        offset = task_end(task);
        bytes = end - offset;

        if (!aio && bytes) {
            aio = aio_task_pool_new(call_state->max_workers);
        }

        ret = block_copy_task_run(aio, task);
        if (ret < 0) {
            goto out;
        }
    }

out:
    if (aio) {
        aio_task_pool_wait_all(aio);

        /*
         * We are not really interested in -ECANCELED returned from
         * block_copy_task_run. If it fails, it means some task already failed
         * for a real reason, so let's return the first failure.
         * Still, assert that we don't rewrite failure by success.
         *
         * Note: ret may be positive here because of block-status result.
         */
        assert(ret >= 0 || aio_task_pool_status(aio) < 0);
        ret = aio_task_pool_status(aio);

        aio_task_pool_free(aio);
    }

    return ret < 0 ? ret : found_dirty;
}

void block_copy_kick(BlockCopyCallState *call_state)
{
    qemu_co_sleep_wake(&call_state->sleep);
}

/*
 * block_copy_common
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, it will
 * help us. If they fail, we will retry not-copied regions. So, if we return
 * an error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not some parallel operation.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_common(BlockCopyCallState *call_state)
{
    int ret;
    BlockCopyState *s = call_state->s;

    qemu_co_mutex_lock(&s->lock);
    QLIST_INSERT_HEAD(&s->calls, call_state, list);
    qemu_co_mutex_unlock(&s->lock);

    do {
        ret = block_copy_dirty_clusters(call_state);

        if (ret == 0 && !qatomic_read(&call_state->cancelled)) {
            WITH_QEMU_LOCK_GUARD(&s->lock) {
                /*
                 * Check that there is no task we still need to
                 * wait for to complete
                 */
                ret = reqlist_wait_one(&s->reqs, call_state->offset,
                                       call_state->bytes, &s->lock);
                if (ret == 0) {
                    /*
                     * No pending tasks, but check again the bitmap in this
                     * same critical section, since a task might have failed
                     * between this and the critical section in
                     * block_copy_dirty_clusters().
                     *
                     * reqlist_wait_one return value 0 also means that it
                     * didn't release the lock. So, we are still in the same
                     * critical section, not interrupted by any concurrent
                     * access to state.
                     */
                    ret = bdrv_dirty_bitmap_next_dirty(s->copy_bitmap,
                                                       call_state->offset,
                                                       call_state->bytes) >= 0;
                }
            }
        }

        /*
         * We retry in two cases:
         * 1. Some progress was made
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed
         *    parallel block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0 && !qatomic_read(&call_state->cancelled));

    qatomic_store_release(&call_state->finished, true);

    if (call_state->cb) {
        call_state->cb(call_state->cb_opaque);
    }

    qemu_co_mutex_lock(&s->lock);
    QLIST_REMOVE(call_state, list);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

static void coroutine_fn block_copy_async_co_entry(void *opaque)
{
    GRAPH_RDLOCK_GUARD();
    block_copy_common(opaque);
}

int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes,
                            bool ignore_ratelimit, uint64_t timeout_ns,
                            BlockCopyAsyncCallbackFunc cb,
                            void *cb_opaque)
{
    int ret;
    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);

    *call_state = (BlockCopyCallState) {
        .s = s,
        .offset = start,
        .bytes = bytes,
        .ignore_ratelimit = ignore_ratelimit,
        .max_workers = BLOCK_COPY_MAX_WORKERS,
        .cb = cb,
        .cb_opaque = cb_opaque,
    };

    ret = qemu_co_timeout(block_copy_async_co_entry, call_state, timeout_ns,
                          g_free);
    if (ret < 0) {
        assert(ret == -ETIMEDOUT);
        block_copy_call_cancel(call_state);
        /* call_state will be freed by the running coroutine. */
        return ret;
    }

    ret = call_state->ret;
    g_free(call_state);

    return ret;
}
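
/*
 * Sketch of a synchronous block_copy() call from coroutine context
 * ("s", "offset" (cluster-aligned) and "timeout_ns" are placeholder
 * caller-side values):
 *
 *     ret = block_copy(s, offset, block_copy_cluster_size(s),
 *                      true, timeout_ns, NULL, NULL);
 *     if (ret == -ETIMEDOUT) {
 *         ... the copy was cancelled and call_state will be freed by the
 *             still-running coroutine, see above ...
 *     }
 */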

BlockCopyCallState *block_copy_async(BlockCopyState *s,
                                     int64_t offset, int64_t bytes,
                                     int max_workers, int64_t max_chunk,
                                     BlockCopyAsyncCallbackFunc cb,
                                     void *cb_opaque)
{
    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);

    *call_state = (BlockCopyCallState) {
        .s = s,
        .offset = offset,
        .bytes = bytes,
        .max_workers = max_workers,
        .max_chunk = max_chunk,
        .cb = cb,
        .cb_opaque = cb_opaque,

        .co = qemu_coroutine_create(block_copy_async_co_entry, call_state),
    };

    qemu_coroutine_enter(call_state->co);

    return call_state;
}

void block_copy_call_free(BlockCopyCallState *call_state)
{
    if (!call_state) {
        return;
    }

    assert(qatomic_read(&call_state->finished));
    g_free(call_state);
}

bool block_copy_call_finished(BlockCopyCallState *call_state)
{
    return qatomic_read(&call_state->finished);
}

bool block_copy_call_succeeded(BlockCopyCallState *call_state)
{
    return qatomic_load_acquire(&call_state->finished) &&
           !qatomic_read(&call_state->cancelled) &&
           call_state->ret == 0;
}

bool block_copy_call_failed(BlockCopyCallState *call_state)
{
    return qatomic_load_acquire(&call_state->finished) &&
           !qatomic_read(&call_state->cancelled) &&
           call_state->ret < 0;
}

bool block_copy_call_cancelled(BlockCopyCallState *call_state)
{
    return qatomic_read(&call_state->cancelled);
}

int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read)
{
    assert(qatomic_load_acquire(&call_state->finished));
    if (error_is_read) {
        *error_is_read = call_state->error_is_read;
    }
    return call_state->ret;
}

/*
 * Note that cancelling and finishing are racy.
 * User can cancel a block-copy that is already finished.
 */
void block_copy_call_cancel(BlockCopyCallState *call_state)
{
    qatomic_set(&call_state->cancelled, true);
    block_copy_kick(call_state);
}

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

int64_t block_copy_cluster_size(BlockCopyState *s)
{
    return s->cluster_size;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    qatomic_set(&s->skip_unallocated, skip);
}

void block_copy_set_speed(BlockCopyState *s, uint64_t speed)
{
    ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME);

    /*
     * Note: it's good to kick all call states from here, but it should be
     * done only from a coroutine, to not crash if s->calls list changed while
     * entering one call. So for now, the only user of this function kicks its
     * only one call_state by hand.
     */
}
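
/*
 * Cancellation sketch for the async API ("cs" and the waiting strategy are
 * illustrative; the callback passed to block_copy_async() is the usual way
 * to learn about completion):
 *
 *     block_copy_call_cancel(cs);
 *     ... wait until block_copy_call_finished(cs) is true ...
 *     if (block_copy_call_cancelled(cs)) {
 *         ... the copy did not run to completion ...
 *     }
 *     block_copy_call_free(cs);
 */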