/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "block/block_int-io.h"
#include "block/dirty-bitmap.h"
#include "block/reqlist.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"
#include "qemu/co-shared-resource.h"
#include "qemu/coroutine.h"
#include "qemu/ratelimit.h"
#include "block/aio_task.h"
#include "qemu/error-report.h"
#include "qemu/memalign.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)
#define BLOCK_COPY_MAX_WORKERS 64
#define BLOCK_COPY_SLICE_TIME 100000000ULL /* ns */
#define BLOCK_COPY_CLUSTER_SIZE_DEFAULT (1 << 16)

typedef enum {
    COPY_READ_WRITE_CLUSTER,
    COPY_READ_WRITE,
    COPY_WRITE_ZEROES,
    COPY_RANGE_SMALL,
    COPY_RANGE_FULL
} BlockCopyMethod;

static coroutine_fn int block_copy_task_entry(AioTask *task);

typedef struct BlockCopyCallState {
    /* Fields initialized in block_copy_async() and never changed. */
    BlockCopyState *s;
    int64_t offset;
    int64_t bytes;
    int max_workers;
    int64_t max_chunk;
    bool ignore_ratelimit;
    BlockCopyAsyncCallbackFunc cb;
    void *cb_opaque;
    /* Coroutine where async block-copy is running */
    Coroutine *co;

    /* Fields whose state changes throughout the execution */
    bool finished; /* atomic */
    QemuCoSleep sleep; /* TODO: protect API with a lock */
    bool cancelled; /* atomic */
    /* To reference all call states from BlockCopyState */
    QLIST_ENTRY(BlockCopyCallState) list;

    /*
     * Fields that report information about return values and errors.
     * Protected by lock in BlockCopyState.
     */
    bool error_is_read;
    /*
     * @ret is set concurrently by tasks under mutex. Only set once by the
     * first failed task (and untouched if no task failed).
     * After finishing (call_state->finished is true), it is not modified
     * anymore and may be safely read without the mutex.
     */
    int ret;
} BlockCopyCallState;

typedef struct BlockCopyTask {
    AioTask task;

    /*
     * Fields initialized in block_copy_task_create()
     * and never changed.
     */
    BlockCopyState *s;
    BlockCopyCallState *call_state;
    /*
     * @method can also be set again in the while loop of
     * block_copy_dirty_clusters(), but it is never accessed concurrently
     * because the only other function that reads it is
     * block_copy_task_entry() and it is invoked afterwards in the same
     * iteration.
     */
    BlockCopyMethod method;

    /*
     * Generally, req is protected by the lock in BlockCopyState. Still,
     * req.offset is only set at task creation, so it may be read concurrently
     * after creation. req.bytes is changed at most once, and only the case of
     * a parallel read while block_copy_task_shrink() updates @bytes needs
     * protection.
     */
    BlockReq req;
} BlockCopyTask;

static int64_t task_end(BlockCopyTask *task)
{
    return task->req.offset + task->req.bytes;
}

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, and the user is responsible for
     * appropriate permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;

    /*
     * Fields initialized in block_copy_state_new()
     * and never changed.
     */
    int64_t cluster_size;
    int64_t max_transfer;
    uint64_t len;
    BdrvRequestFlags write_flags;

    /*
     * Fields whose state changes throughout the execution
     * Protected by lock.
     */
    CoMutex lock;
    int64_t in_flight_bytes;
    BlockCopyMethod method;
    bool discard_source;
    BlockReqList reqs;
    QLIST_HEAD(, BlockCopyCallState) calls;
    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: it may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated; /* atomic */
    /* State fields that use a thread-safe API */
    BdrvDirtyBitmap *copy_bitmap;
    ProgressMeter *progress;
    SharedResource *mem;
    RateLimit rate_limit;
} BlockCopyState;

/* Called with lock held */
static int64_t block_copy_chunk_size(BlockCopyState *s)
{
    switch (s->method) {
    case COPY_READ_WRITE_CLUSTER:
        return s->cluster_size;
    case COPY_READ_WRITE:
    case COPY_RANGE_SMALL:
        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER),
                   s->max_transfer);
    case COPY_RANGE_FULL:
        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                   s->max_transfer);
    default:
        /* Cannot have COPY_WRITE_ZEROES here. */
        abort();
    }
}

/*
 * Search for the first dirty area in the offset/bytes range and create a task
 * at the beginning of it.
 */
static coroutine_fn BlockCopyTask *
block_copy_task_create(BlockCopyState *s, BlockCopyCallState *call_state,
                       int64_t offset, int64_t bytes)
{
    BlockCopyTask *task;
    int64_t max_chunk;

    QEMU_LOCK_GUARD(&s->lock);
    max_chunk = MIN_NON_ZERO(block_copy_chunk_size(s), call_state->max_chunk);
    if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
                                           offset, offset + bytes,
                                           max_chunk, &offset, &bytes))
    {
        return NULL;
    }

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    bytes = QEMU_ALIGN_UP(bytes, s->cluster_size);

    /* region is dirty, so no existing tasks are possible in it */
    assert(!reqlist_find_conflict(&s->reqs, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    task = g_new(BlockCopyTask, 1);
    *task = (BlockCopyTask) {
        .task.func = block_copy_task_entry,
        .s = s,
        .call_state = call_state,
        .method = s->method,
    };
    reqlist_init_req(&s->reqs, &task->req, offset, bytes);

    return task;
}
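/*
 * Illustrative numbers only (not taken from any particular configuration):
 * with the default 64 KiB cluster_size and a max_transfer of, say, 8 MiB,
 * block_copy_chunk_size() yields 64 KiB for COPY_READ_WRITE_CLUSTER,
 * MIN(MAX(64 KiB, 1 MiB), 8 MiB) = 1 MiB for COPY_READ_WRITE and
 * COPY_RANGE_SMALL, and MIN(MAX(64 KiB, 16 MiB), 8 MiB) = 8 MiB for
 * COPY_RANGE_FULL. block_copy_task_create() then clamps this further with the
 * caller's max_chunk via MIN_NON_ZERO(), so an async caller passing
 * max_chunk = 256 KiB would cap every task at 256 KiB.
 */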
/*
 * block_copy_task_shrink
 *
 * Drop the tail of the task to be handled later. Set the dirty bits back and
 * wake up all tasks waiting for us (maybe some of them do not intersect the
 * shrunk task).
 */
static void coroutine_fn block_copy_task_shrink(BlockCopyTask *task,
                                                int64_t new_bytes)
{
    QEMU_LOCK_GUARD(&task->s->lock);
    if (new_bytes == task->req.bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < task->req.bytes);

    task->s->in_flight_bytes -= task->req.bytes - new_bytes;
    bdrv_set_dirty_bitmap(task->s->copy_bitmap,
                          task->req.offset + new_bytes,
                          task->req.bytes - new_bytes);

    reqlist_shrink_req(&task->req, new_bytes);
}

static void coroutine_fn block_copy_task_end(BlockCopyTask *task, int ret)
{
    QEMU_LOCK_GUARD(&task->s->lock);
    task->s->in_flight_bytes -= task->req.bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->req.offset,
                              task->req.bytes);
    }
    if (task->s->progress) {
        progress_set_remaining(task->s->progress,
                               bdrv_get_dirty_count(task->s->copy_bitmap) +
                               task->s->in_flight_bytes);
    }
    reqlist_remove_req(&task->req);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    ratelimit_destroy(&s->rate_limit);
    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
                              bool compress)
{
    /* Keep BDRV_REQ_SERIALISING set (or not set) in block_copy_state_new() */
    s->write_flags = (s->write_flags & BDRV_REQ_SERIALISING) |
        (compress ? BDRV_REQ_WRITE_COMPRESSED : 0);

    if (s->max_transfer < s->cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own).
         */
        s->method = COPY_READ_WRITE_CLUSTER;
    } else if (compress) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->method = COPY_READ_WRITE_CLUSTER;
    } else {
        /*
         * If copy_range is enabled, start with COPY_RANGE_SMALL, until the
         * first successful copy_range (look at block_copy_do_copy).
         */
        s->method = use_copy_range ? COPY_RANGE_SMALL : COPY_READ_WRITE;
    }
}
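/*
 * Illustrative examples only (values are not from any particular setup): a
 * caller requesting compressed writes always ends up with
 * COPY_READ_WRITE_CLUSTER; a configuration whose max_transfer of 32 KiB is
 * smaller than a 64 KiB cluster also falls back to COPY_READ_WRITE_CLUSTER;
 * otherwise use_copy_range=true starts at COPY_RANGE_SMALL, and only
 * block_copy_do_copy() may later promote it to COPY_RANGE_FULL or demote it
 * to COPY_READ_WRITE.
 */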
static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
                                                 Error **errp)
{
    int ret;
    BlockDriverInfo bdi;
    bool target_does_cow;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    target_does_cow = bdrv_backing_chain_next(target);

    /*
     * If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible.
     */
    ret = bdrv_get_info(target, &bdi);
    if (ret == -ENOTSUP && !target_does_cow) {
        /* Cluster size is not defined */
        warn_report("The target block device doesn't provide "
                    "information about the block size and it doesn't have a "
                    "backing file. The default block size of %u bytes is "
                    "used. If the actual block size of the target exceeds "
                    "this default, the backup may be unusable",
                    BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
    } else if (ret < 0 && !target_does_cow) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
            "which has no backing file");
        error_append_hint(errp,
            "Aborting, since this may create an unusable destination image\n");
        return ret;
    } else if (ret < 0 && target_does_cow) {
        /* Not fatal; just trudge on ahead. */
        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
    }

    return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     BlockDriverState *copy_bitmap_bs,
                                     const BdrvDirtyBitmap *bitmap,
                                     bool discard_source,
                                     Error **errp)
{
    ERRP_GUARD();
    BlockCopyState *s;
    int64_t cluster_size;
    BdrvDirtyBitmap *copy_bitmap;
    bool is_fleecing;

    GLOBAL_STATE_CODE();

    cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
    if (cluster_size < 0) {
        return NULL;
    }

    copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);
    if (bitmap) {
        if (!bdrv_merge_dirty_bitmap(copy_bitmap, bitmap, NULL, errp)) {
            error_prepend(errp, "Failed to merge bitmap '%s' to internal "
                          "copy-bitmap: ", bdrv_dirty_bitmap_name(bitmap));
            bdrv_release_dirty_bitmap(copy_bitmap);
            return NULL;
        }
    } else {
        bdrv_set_dirty_bitmap(copy_bitmap, 0,
                              bdrv_dirty_bitmap_size(copy_bitmap));
    }

    /*
     * If source is in the backing chain of target, assume that target is
     * going to be used for "image fleecing", i.e. it should represent a kind
     * of snapshot of the source at the backup-start point in time. And target
     * is going to be read by somebody (for example, used as an NBD export)
     * during the backup job.
     *
     * In this case, we need to add the BDRV_REQ_SERIALISING write flag to
     * avoid intersection of backup writes and third-party reads from target;
     * otherwise, when reading from target, we may occasionally read data that
     * has already been updated by the guest.
     *
     * For more information see commit f8d59dfb40bb and test
     * tests/qemu-iotests/222
     */
    bdrv_graph_rdlock_main_loop();
    is_fleecing = bdrv_chain_contains(target->bs, source->bs);
    bdrv_graph_rdunlock_main_loop();

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = (is_fleecing ? BDRV_REQ_SERIALISING : 0),
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
        .max_transfer = QEMU_ALIGN_DOWN(
                                    block_copy_max_transfer(source, target),
                                    cluster_size),
    };

    s->discard_source = discard_source;
    block_copy_set_copy_opts(s, false, false);

    ratelimit_init(&s->rate_limit);
    qemu_co_mutex_init(&s->lock);
    QLIST_INIT(&s->reqs);
    QLIST_INIT(&s->calls);

    return s;
}

/* Only set before running the job, no need for locking. */
void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}
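/*
 * Rough usage sketch (illustrative only, not compiled here): a hypothetical
 * caller that already holds suitable BdrvChild pointers @src and @tgt could
 * drive a one-shot synchronous copy of the first 1 MiB from a coroutine
 * roughly like this:
 *
 *     BlockCopyState *bcs;
 *     int ret;
 *
 *     bcs = block_copy_state_new(src, tgt, src->bs, NULL, false, errp);
 *     if (!bcs) {
 *         return -EINVAL;
 *     }
 *     block_copy_set_progress_meter(bcs, my_progress_meter);
 *     ret = block_copy(bcs, 0, 1 * MiB, true, UINT64_MAX, NULL, NULL);
 *     block_copy_state_free(bcs);
 *
 * @src, @tgt, @errp and @my_progress_meter are placeholders supplied by the
 * caller; offset and size must be cluster-aligned, as
 * block_copy_dirty_clusters() below asserts.
 */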
/*
 * Takes ownership of @task
 *
 * If pool is NULL, directly run the task, otherwise schedule it into the pool.
 *
 * Returns: task.func return code if pool is NULL
 *          otherwise -ECANCELED if pool status is bad
 *          otherwise 0 (successfully scheduled)
 */
static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
                                            BlockCopyTask *task)
{
    if (!pool) {
        int ret = task->task.func(&task->task);

        g_free(task);
        return ret;
    }

    aio_task_pool_wait_slot(pool);
    if (aio_task_pool_status(pool) < 0) {
        co_put_to_shres(task->s->mem, task->req.bytes);
        block_copy_task_end(task, -ECANCELED);
        g_free(task);
        return -ECANCELED;
    }

    aio_task_pool_start_task(pool, &task->task);

    return 0;
}

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to
 * clusters.
 *
 * No sync here: neither bitmap nor intersecting requests handling, only copy.
 *
 * @method is an in-out argument, so that copy_range can be either extended to
 * a full-size buffer or disabled if the copy_range attempt fails. The output
 * value of @method should be used for subsequent tasks.
 * Returns 0 on success.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_do_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                   BlockCopyMethod *method, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    switch (*method) {
    case COPY_WRITE_ZEROES:
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            *error_is_read = false;
        }
        return ret;

    case COPY_RANGE_SMALL:
    case COPY_RANGE_FULL:
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret >= 0) {
            /* Successful copy-range, increase chunk size. */
            *method = COPY_RANGE_FULL;
            return 0;
        }

        trace_block_copy_copy_range_fail(s, offset, ret);
        *method = COPY_READ_WRITE;
        /* Fall through to read+write with allocated buffer */

    case COPY_READ_WRITE_CLUSTER:
    case COPY_READ_WRITE:
        /*
         * In case the copy_range request above failed, we may proceed with a
         * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
         * requests will be properly limited, so don't care too much.
         * Moreover, the most likely case (copy_range is unsupported for the
         * configuration, so the very first copy_range request fails) is
         * handled by setting the large copy_size only after the first
         * successful copy_range.
         */

        bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

        ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
        if (ret < 0) {
            trace_block_copy_read_fail(s, offset, ret);
            *error_is_read = true;
            goto out;
        }

        ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                             s->write_flags);
        if (ret < 0) {
            trace_block_copy_write_fail(s, offset, ret);
            *error_is_read = false;
            goto out;
        }

    out:
        qemu_vfree(bounce_buffer);
        break;

    default:
        abort();
    }

    return ret;
}

static coroutine_fn int block_copy_task_entry(AioTask *task)
{
    BlockCopyTask *t = container_of(task, BlockCopyTask, task);
    BlockCopyState *s = t->s;
    bool error_is_read = false;
    BlockCopyMethod method = t->method;
    int ret;

    WITH_GRAPH_RDLOCK_GUARD() {
        ret = block_copy_do_copy(s, t->req.offset, t->req.bytes, &method,
                                 &error_is_read);
    }

    WITH_QEMU_LOCK_GUARD(&s->lock) {
        if (s->method == t->method) {
            s->method = method;
        }

        if (ret < 0) {
            if (!t->call_state->ret) {
                t->call_state->ret = ret;
                t->call_state->error_is_read = error_is_read;
            }
        } else if (s->progress) {
            progress_work_done(s->progress, t->req.bytes);
        }
    }
    co_put_to_shres(s->mem, t->req.bytes);
    block_copy_task_end(t, ret);

    if (s->discard_source && ret == 0) {
        int64_t nbytes =
            MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
        bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
    }

    return ret;
}

static coroutine_fn GRAPH_RDLOCK
int block_copy_block_status(BlockCopyState *s, int64_t offset, int64_t bytes,
                            int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (qatomic_read(&s->skip_unallocated)) {
        base = bdrv_backing_chain_next(s->source->bs);
    } else {
        base = NULL;
    }

    ret = bdrv_co_block_status_above(s->source->bs, base, offset, bytes, &num,
                                     NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
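/*
 * Illustrative only: with a 64 KiB cluster_size, a status result of
 * num = 96 KiB somewhere in the middle of the image is clipped down to
 * 64 KiB; the same 96 KiB ending exactly at s->len is rounded up to 128 KiB
 * so that the trailing short cluster is still covered; an error, or a result
 * smaller than one cluster, degenerates to a single cluster reported as
 * BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA.
 */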
/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via pnum the number of contiguous clusters sharing this allocation.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        /* protected in backup_run() */
        ret = bdrv_co_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes)
{
    QEMU_LOCK_GUARD(&s->lock);

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    if (s->progress) {
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s,
                                                  int64_t offset,
                                                  int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        block_copy_reset(s, offset, bytes);
    }

    *count = bytes;
    return ret;
}
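/*
 * Rough sketch of how a sync=top style caller might use the helpers above
 * (illustrative only; @s and the surrounding coroutine are assumed to exist,
 * and the loop bound is the bitmap size queried via the accessors defined
 * later in this file):
 *
 *     int64_t offset = 0, count;
 *     int64_t len = bdrv_dirty_bitmap_size(block_copy_dirty_bitmap(s));
 *
 *     block_copy_set_skip_unallocated(s, true);
 *     while (offset < len) {
 *         int64_t ret = block_copy_reset_unallocated(s, offset, &count);
 *         if (ret < 0) {
 *             break;
 *         }
 *         offset += count;
 *     }
 */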
/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_dirty_clusters(BlockCopyCallState *call_state)
{
    BlockCopyState *s = call_state->s;
    int64_t offset = call_state->offset;
    int64_t bytes = call_state->bytes;

    int ret = 0;
    bool found_dirty = false;
    int64_t end = offset + bytes;
    AioTaskPool *aio = NULL;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same AioContext.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes && aio_task_pool_status(aio) == 0 &&
           !qatomic_read(&call_state->cancelled)) {
        BlockCopyTask *task;
        int64_t status_bytes;

        task = block_copy_task_create(s, call_state, offset, bytes);
        if (!task) {
            /* No more dirty bits in the bitmap */
            trace_block_copy_skip_range(s, offset, bytes);
            break;
        }
        if (task->req.offset > offset) {
            trace_block_copy_skip_range(s, offset, task->req.offset - offset);
        }

        found_dirty = true;

        ret = block_copy_block_status(s, task->req.offset, task->req.bytes,
                                      &status_bytes);
        assert(ret >= 0); /* never fail */
        if (status_bytes < task->req.bytes) {
            block_copy_task_shrink(task, status_bytes);
        }
        if (qatomic_read(&s->skip_unallocated) &&
            !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_task_end(task, 0);
            trace_block_copy_skip_range(s, task->req.offset, task->req.bytes);
            offset = task_end(task);
            bytes = end - offset;
            g_free(task);
            continue;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            task->method = COPY_WRITE_ZEROES;
        }

        if (!call_state->ignore_ratelimit) {
            uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0);
            if (ns > 0) {
                block_copy_task_end(task, -EAGAIN);
                g_free(task);
                qemu_co_sleep_ns_wakeable(&call_state->sleep,
                                          QEMU_CLOCK_REALTIME, ns);
                continue;
            }
        }

        ratelimit_calculate_delay(&s->rate_limit, task->req.bytes);

        trace_block_copy_process(s, task->req.offset);

        co_get_from_shres(s->mem, task->req.bytes);

        offset = task_end(task);
        bytes = end - offset;

        if (!aio && bytes) {
            aio = aio_task_pool_new(call_state->max_workers);
        }

        ret = block_copy_task_run(aio, task);
        if (ret < 0) {
            goto out;
        }
    }

out:
    if (aio) {
        aio_task_pool_wait_all(aio);

        /*
         * We are not really interested in -ECANCELED returned from
         * block_copy_task_run. If it fails, it means some task already failed
         * for a real reason; let's return the first failure.
         * Still, assert that we don't rewrite a failure with success.
         *
         * Note: ret may be positive here because of the block-status result.
         */
        assert(ret >= 0 || aio_task_pool_status(aio) < 0);
        ret = aio_task_pool_status(aio);

        aio_task_pool_free(aio);
    }

    return ret < 0 ? ret : found_dirty;
}

void block_copy_kick(BlockCopyCallState *call_state)
{
    qemu_co_sleep_wake(&call_state->sleep);
}

/*
 * block_copy_common
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, it will
 * help us. If they fail, we will retry the not-copied regions. So, if we
 * return an error, it means that some I/O operation failed in the context of
 * _this_ block_copy call, not of some parallel operation.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_common(BlockCopyCallState *call_state)
{
    int ret;
    BlockCopyState *s = call_state->s;

    qemu_co_mutex_lock(&s->lock);
    QLIST_INSERT_HEAD(&s->calls, call_state, list);
    qemu_co_mutex_unlock(&s->lock);

    do {
        ret = block_copy_dirty_clusters(call_state);

        if (ret == 0 && !qatomic_read(&call_state->cancelled)) {
            WITH_QEMU_LOCK_GUARD(&s->lock) {
                /*
                 * Check that there is no task we still need to
                 * wait for to complete
                 */
                ret = reqlist_wait_one(&s->reqs, call_state->offset,
                                       call_state->bytes, &s->lock);
                if (ret == 0) {
                    /*
                     * No pending tasks, but check the bitmap again in this
                     * same critical section, since a task might have failed
                     * between this and the critical section in
                     * block_copy_dirty_clusters().
                     *
                     * A reqlist_wait_one return value of 0 also means that it
                     * didn't release the lock. So, we are still in the same
                     * critical section, not interrupted by any concurrent
                     * access to state.
                     */
                    ret = bdrv_dirty_bitmap_next_dirty(s->copy_bitmap,
                                                       call_state->offset,
                                                       call_state->bytes) >= 0;
                }
            }
        }

        /*
         * We retry in two cases:
         * 1. Some progress was done
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed
         *    parallel block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0 && !qatomic_read(&call_state->cancelled));

    qatomic_store_release(&call_state->finished, true);

    if (call_state->cb) {
        call_state->cb(call_state->cb_opaque);
    }

    qemu_co_mutex_lock(&s->lock);
    QLIST_REMOVE(call_state, list);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

static void coroutine_fn block_copy_async_co_entry(void *opaque)
{
    GRAPH_RDLOCK_GUARD();
    block_copy_common(opaque);
}

int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes,
                            bool ignore_ratelimit, uint64_t timeout_ns,
                            BlockCopyAsyncCallbackFunc cb,
                            void *cb_opaque)
{
    int ret;
    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);

    *call_state = (BlockCopyCallState) {
        .s = s,
        .offset = start,
        .bytes = bytes,
        .ignore_ratelimit = ignore_ratelimit,
        .max_workers = BLOCK_COPY_MAX_WORKERS,
        .cb = cb,
        .cb_opaque = cb_opaque,
    };

    ret = qemu_co_timeout(block_copy_async_co_entry, call_state, timeout_ns,
                          g_free);
    if (ret < 0) {
        assert(ret == -ETIMEDOUT);
        block_copy_call_cancel(call_state);
        /* call_state will be freed by the running coroutine. */
        return ret;
    }

    ret = call_state->ret;
    g_free(call_state);

    return ret;
}

BlockCopyCallState *block_copy_async(BlockCopyState *s,
                                     int64_t offset, int64_t bytes,
                                     int max_workers, int64_t max_chunk,
                                     BlockCopyAsyncCallbackFunc cb,
                                     void *cb_opaque)
{
    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);

    *call_state = (BlockCopyCallState) {
        .s = s,
        .offset = offset,
        .bytes = bytes,
        .max_workers = max_workers,
        .max_chunk = max_chunk,
        .cb = cb,
        .cb_opaque = cb_opaque,

        .co = qemu_coroutine_create(block_copy_async_co_entry, call_state),
    };

    qemu_coroutine_enter(call_state->co);

    return call_state;
}

void block_copy_call_free(BlockCopyCallState *call_state)
{
    if (!call_state) {
        return;
    }

    assert(qatomic_read(&call_state->finished));
    g_free(call_state);
}

bool block_copy_call_finished(BlockCopyCallState *call_state)
{
    return qatomic_read(&call_state->finished);
}

bool block_copy_call_succeeded(BlockCopyCallState *call_state)
{
    return qatomic_load_acquire(&call_state->finished) &&
           !qatomic_read(&call_state->cancelled) &&
           call_state->ret == 0;
}

bool block_copy_call_failed(BlockCopyCallState *call_state)
{
    return qatomic_load_acquire(&call_state->finished) &&
           !qatomic_read(&call_state->cancelled) &&
           call_state->ret < 0;
}

bool block_copy_call_cancelled(BlockCopyCallState *call_state)
{
    return qatomic_read(&call_state->cancelled);
}

int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read)
{
    assert(qatomic_load_acquire(&call_state->finished));
    if (error_is_read) {
        *error_is_read = call_state->error_is_read;
    }
    return call_state->ret;
}
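/*
 * Rough async usage sketch (illustrative only): a hypothetical caller could
 * start a background copy and later inspect the result roughly like this,
 * where @s, my_done_cb and my_state are placeholders supplied by the caller:
 *
 *     BlockCopyCallState *cs;
 *
 *     cs = block_copy_async(s, 0, 64 * MiB, BLOCK_COPY_MAX_WORKERS, 0,
 *                           my_done_cb, my_state);
 *     ...
 *     if (block_copy_call_finished(cs)) {
 *         bool error_is_read;
 *         int ret = block_copy_call_status(cs, &error_is_read);
 *
 *         block_copy_call_free(cs);
 *     }
 *
 * A caller that wants to stop early calls block_copy_call_cancel(cs) below
 * and still waits for the finished state before freeing the call state.
 */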
/*
 * Note that cancelling and finishing are racy.
 * The user can cancel a block-copy that has already finished.
 */
void block_copy_call_cancel(BlockCopyCallState *call_state)
{
    qatomic_set(&call_state->cancelled, true);
    block_copy_kick(call_state);
}

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

int64_t block_copy_cluster_size(BlockCopyState *s)
{
    return s->cluster_size;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    qatomic_set(&s->skip_unallocated, skip);
}

void block_copy_set_speed(BlockCopyState *s, uint64_t speed)
{
    ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME);

    /*
     * Note: it's good to kick all call states from here, but it should be
     * done only from a coroutine, so as not to crash if the s->calls list
     * changes while entering one call. So for now, the only user of this
     * function kicks its single call_state by hand.
     */
}