/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "block/block_int-io.h"
#include "block/dirty-bitmap.h"
#include "block/reqlist.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"
#include "qemu/co-shared-resource.h"
#include "qemu/coroutine.h"
#include "qemu/ratelimit.h"
#include "block/aio_task.h"
#include "qemu/error-report.h"
#include "qemu/memalign.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)
#define BLOCK_COPY_MAX_WORKERS 64
#define BLOCK_COPY_SLICE_TIME 100000000ULL /* ns */
#define BLOCK_COPY_CLUSTER_SIZE_DEFAULT (1 << 16)

typedef enum {
    COPY_READ_WRITE_CLUSTER,
    COPY_READ_WRITE,
    COPY_WRITE_ZEROES,
    COPY_RANGE_SMALL,
    COPY_RANGE_FULL
} BlockCopyMethod;
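
/*
 * How @method typically evolves (see block_copy_set_copy_opts() and
 * block_copy_do_copy()):
 *
 *  - COPY_READ_WRITE_CLUSTER is chosen when compression is requested or when
 *    max_transfer is smaller than the cluster size;
 *  - otherwise copying starts as COPY_RANGE_SMALL (if copy_range is enabled)
 *    or COPY_READ_WRITE;
 *  - the first successful copy_range upgrades COPY_RANGE_SMALL to
 *    COPY_RANGE_FULL, while a failed copy_range degrades to COPY_READ_WRITE;
 *  - COPY_WRITE_ZEROES is selected per task when block status reports
 *    BDRV_BLOCK_ZERO.
 */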

static coroutine_fn int block_copy_task_entry(AioTask *task);

typedef struct BlockCopyCallState {
    /* Fields initialized in block_copy_async() and never changed. */
    BlockCopyState *s;
    int64_t offset;
    int64_t bytes;
    int max_workers;
    int64_t max_chunk;
    bool ignore_ratelimit;
    BlockCopyAsyncCallbackFunc cb;
    void *cb_opaque;
    /* Coroutine where async block-copy is running */
    Coroutine *co;

    /* Fields whose state changes throughout the execution */
    bool finished; /* atomic */
    QemuCoSleep sleep; /* TODO: protect API with a lock */
    bool cancelled; /* atomic */
    /* To reference all call states from BlockCopyState */
    QLIST_ENTRY(BlockCopyCallState) list;

    /*
     * Fields that report information about return values and errors.
     * Protected by lock in BlockCopyState.
     */
    bool error_is_read;
    /*
     * @ret is set concurrently by tasks under mutex. Only set once by the
     * first failed task (and untouched if no task failed).
     * After finishing (call_state->finished is true), it is not modified
     * anymore and may be safely read without the mutex.
     */
    int ret;
} BlockCopyCallState;

typedef struct BlockCopyTask {
    AioTask task;

    /*
     * Fields initialized in block_copy_task_create()
     * and never changed.
     */
    BlockCopyState *s;
    BlockCopyCallState *call_state;
    /*
     * @method can also be set again in the while loop of
     * block_copy_dirty_clusters(), but it is never accessed concurrently
     * because the only other function that reads it is
     * block_copy_task_entry() and it is invoked afterwards in the same
     * iteration.
     */
    BlockCopyMethod method;

    /*
     * Generally, req is protected by the lock in BlockCopyState. Still,
     * req.offset is only set on task creation, so it may be read concurrently
     * after creation. req.bytes is changed at most once, and only the case of
     * a parallel read while @bytes is updated in block_copy_task_shrink()
     * needs protection.
     */
    BlockReq req;
} BlockCopyTask;

static int64_t task_end(BlockCopyTask *task)
{
    return task->req.offset + task->req.bytes;
}

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, and the user is responsible for
     * appropriate permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;

    /*
     * Fields initialized in block_copy_state_new()
     * and never changed.
     */
    int64_t cluster_size;
    int64_t max_transfer;
    uint64_t len;
    BdrvRequestFlags write_flags;

    /*
     * Fields whose state changes throughout the execution.
     * Protected by lock.
     */
    CoMutex lock;
    int64_t in_flight_bytes;
    BlockCopyMethod method;
    bool discard_source;
    BlockReqList reqs;
    QLIST_HEAD(, BlockCopyCallState) calls;
    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: It may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated; /* atomic */
    /* State fields that use a thread-safe API */
    BdrvDirtyBitmap *copy_bitmap;
    ProgressMeter *progress;
    SharedResource *mem;
    RateLimit rate_limit;
} BlockCopyState;

/* Called with lock held */
static int64_t block_copy_chunk_size(BlockCopyState *s)
{
    switch (s->method) {
    case COPY_READ_WRITE_CLUSTER:
        return s->cluster_size;
    case COPY_READ_WRITE:
    case COPY_RANGE_SMALL:
        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER),
                   s->max_transfer);
    case COPY_RANGE_FULL:
        return MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                   s->max_transfer);
    default:
        /* Cannot have COPY_WRITE_ZEROES here. */
        abort();
    }
}
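
/*
 * For example (illustrative numbers), with cluster_size = 64 KiB and
 * max_transfer = 2 MiB:
 *
 *     COPY_READ_WRITE_CLUSTER            -> 64 KiB
 *     COPY_READ_WRITE / COPY_RANGE_SMALL -> MIN(MAX(64K, 1M), 2M)  = 1 MiB
 *     COPY_RANGE_FULL                    -> MIN(MAX(64K, 16M), 2M) = 2 MiB
 */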

/*
 * Search for the first dirty area in the offset/bytes range and create a task
 * at the beginning of it.
 */
static coroutine_fn BlockCopyTask *
block_copy_task_create(BlockCopyState *s, BlockCopyCallState *call_state,
                       int64_t offset, int64_t bytes)
{
    BlockCopyTask *task;
    int64_t max_chunk;

    QEMU_LOCK_GUARD(&s->lock);
    max_chunk = MIN_NON_ZERO(block_copy_chunk_size(s), call_state->max_chunk);
    if (!bdrv_dirty_bitmap_next_dirty_area(s->copy_bitmap,
                                           offset, offset + bytes,
                                           max_chunk, &offset, &bytes))
    {
        return NULL;
    }

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    bytes = QEMU_ALIGN_UP(bytes, s->cluster_size);

    /* region is dirty, so no existent tasks possible in it */
    assert(!reqlist_find_conflict(&s->reqs, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    task = g_new(BlockCopyTask, 1);
    *task = (BlockCopyTask) {
        .task.func = block_copy_task_entry,
        .s = s,
        .call_state = call_state,
        .method = s->method,
    };
    reqlist_init_req(&s->reqs, &task->req, offset, bytes);

    return task;
}

/*
 * block_copy_task_shrink
 *
 * Drop the tail of the task, to be handled later. Set the dirty bits back and
 * wake up all tasks waiting for us (maybe some of them do not intersect with
 * the shrunk task).
 */
static void coroutine_fn block_copy_task_shrink(BlockCopyTask *task,
                                                int64_t new_bytes)
{
    QEMU_LOCK_GUARD(&task->s->lock);
    if (new_bytes == task->req.bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < task->req.bytes);

    task->s->in_flight_bytes -= task->req.bytes - new_bytes;
    bdrv_set_dirty_bitmap(task->s->copy_bitmap,
                          task->req.offset + new_bytes,
                          task->req.bytes - new_bytes);

    reqlist_shrink_req(&task->req, new_bytes);
}

static void coroutine_fn block_copy_task_end(BlockCopyTask *task, int ret)
{
    QEMU_LOCK_GUARD(&task->s->lock);
    task->s->in_flight_bytes -= task->req.bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->req.offset,
                              task->req.bytes);
    }
    if (task->s->progress) {
        progress_set_remaining(task->s->progress,
                               bdrv_get_dirty_count(task->s->copy_bitmap) +
                               task->s->in_flight_bytes);
    }
    reqlist_remove_req(&task->req);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    ratelimit_destroy(&s->rate_limit);
    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
                              bool compress)
{
    /* Keep BDRV_REQ_SERIALISING set (or not set) in block_copy_state_new() */
    s->write_flags = (s->write_flags & BDRV_REQ_SERIALISING) |
        (compress ? BDRV_REQ_WRITE_COMPRESSED : 0);

    if (s->max_transfer < s->cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own).
         */
        s->method = COPY_READ_WRITE_CLUSTER;
    } else if (compress) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->method = COPY_READ_WRITE_CLUSTER;
    } else {
        /*
         * If copy range is enabled, start with COPY_RANGE_SMALL, until the
         * first successful copy_range (see block_copy_do_copy()).
         */
        s->method = use_copy_range ? COPY_RANGE_SMALL : COPY_READ_WRITE;
    }
}

static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
                                                 Error **errp)
{
    int ret;
    BlockDriverInfo bdi;
    bool target_does_cow;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    target_does_cow = bdrv_backing_chain_next(target);

    /*
     * If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible.
     */
    ret = bdrv_get_info(target, &bdi);
    if (ret == -ENOTSUP && !target_does_cow) {
        /* Cluster size is not defined */
        warn_report("The target block device doesn't provide "
                    "information about the block size and it doesn't have a "
                    "backing file. The default block size of %u bytes is "
                    "used. If the actual block size of the target exceeds "
                    "this default, the backup may be unusable",
                    BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
    } else if (ret < 0 && !target_does_cow) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
            "which has no backing file");
        error_append_hint(errp,
            "Aborting, since this may create an unusable destination image\n");
        return ret;
    } else if (ret < 0 && target_does_cow) {
        /* Not fatal; just trudge on ahead. */
        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
    }

    return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
}
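
/*
 * For example (illustrative values): a target whose driver reports a 2 MiB
 * cluster size yields MAX(64 KiB, 2 MiB) = 2 MiB, while a target that reports
 * no cluster size and has no backing file falls back to the 64 KiB default
 * with a warning.
 */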

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     BlockDriverState *copy_bitmap_bs,
                                     const BdrvDirtyBitmap *bitmap,
                                     bool discard_source,
                                     Error **errp)
{
    ERRP_GUARD();
    BlockCopyState *s;
    int64_t cluster_size;
    BdrvDirtyBitmap *copy_bitmap;
    bool is_fleecing;

    GLOBAL_STATE_CODE();

    cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
    if (cluster_size < 0) {
        return NULL;
    }

    copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);
    if (bitmap) {
        if (!bdrv_merge_dirty_bitmap(copy_bitmap, bitmap, NULL, errp)) {
            error_prepend(errp, "Failed to merge bitmap '%s' to internal "
                          "copy-bitmap: ", bdrv_dirty_bitmap_name(bitmap));
            bdrv_release_dirty_bitmap(copy_bitmap);
            return NULL;
        }
    } else {
        bdrv_set_dirty_bitmap(copy_bitmap, 0,
                              bdrv_dirty_bitmap_size(copy_bitmap));
    }

    /*
     * If source is in the backing chain of target, assume that target is
     * going to be used for "image fleecing", i.e. it should represent a kind
     * of snapshot of the source at backup-start point in time, and target is
     * going to be read by somebody (for example, used as an NBD export)
     * during the backup job.
     *
     * In this case, we need to add the BDRV_REQ_SERIALISING write flag to
     * avoid intersection of backup writes and third-party reads from target;
     * otherwise, when reading from target, we may occasionally read data that
     * has already been updated by the guest.
     *
     * For more information see commit f8d59dfb40bb and test
     * tests/qemu-iotests/222
     */
    bdrv_graph_rdlock_main_loop();
    is_fleecing = bdrv_chain_contains(target->bs, source->bs);
    bdrv_graph_rdunlock_main_loop();

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = (is_fleecing ? BDRV_REQ_SERIALISING : 0),
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
        .max_transfer = QEMU_ALIGN_DOWN(
                            block_copy_max_transfer(source, target),
                            cluster_size),
    };

    s->discard_source = discard_source;
    block_copy_set_copy_opts(s, false, false);

    ratelimit_init(&s->rate_limit);
    qemu_co_mutex_init(&s->lock);
    QLIST_INIT(&s->reqs);
    QLIST_INIT(&s->calls);

    return s;
}

/* Only set before running the job, no need for locking. */
void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}
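
/*
 * A typical caller might wire the API together roughly like this (a sketch;
 * 'job', 'speed', 'cb' and 'opaque' are illustrative names, not part of this
 * API):
 *
 *     BlockCopyState *s;
 *     BlockCopyCallState *call;
 *
 *     s = block_copy_state_new(source, target, copy_bitmap_bs, NULL,
 *                              false, errp);
 *     if (!s) {
 *         return;
 *     }
 *     block_copy_set_progress_meter(s, &job->progress);
 *     block_copy_set_speed(s, speed);
 *     call = block_copy_async(s, 0,
 *                             bdrv_dirty_bitmap_size(
 *                                 block_copy_dirty_bitmap(s)),
 *                             BLOCK_COPY_MAX_WORKERS, 0, cb, opaque);
 */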

/*
 * Takes ownership of @task
 *
 * If pool is NULL, directly run the task; otherwise schedule it into the pool.
 *
 * Returns: task.func return code if pool is NULL
 *          otherwise -ECANCELED if pool status is bad
 *          otherwise 0 (successfully scheduled)
 */
static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
                                            BlockCopyTask *task)
{
    if (!pool) {
        int ret = task->task.func(&task->task);

        g_free(task);
        return ret;
    }

    aio_task_pool_wait_slot(pool);
    if (aio_task_pool_status(pool) < 0) {
        co_put_to_shres(task->s->mem, task->req.bytes);
        block_copy_task_end(task, -ECANCELED);
        g_free(task);
        return -ECANCELED;
    }

    aio_task_pool_start_task(pool, &task->task);

    return 0;
}

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to
 * clusters.
 *
 * No sync here: neither bitmap nor intersecting requests handling, only copy.
 *
 * @method is an in-out argument, so that copy_range can be either extended to
 * a full-size buffer or disabled if the copy_range attempt fails. The output
 * value of @method should be used for subsequent tasks.
 * Returns 0 on success.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_do_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                   BlockCopyMethod *method, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    switch (*method) {
    case COPY_WRITE_ZEROES:
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            *error_is_read = false;
        }
        return ret;

    case COPY_RANGE_SMALL:
    case COPY_RANGE_FULL:
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret >= 0) {
            /* Successful copy-range, increase chunk size. */
            *method = COPY_RANGE_FULL;
            return 0;
        }

        trace_block_copy_copy_range_fail(s, offset, ret);
        *method = COPY_READ_WRITE;
        /* Fall through to read+write with allocated buffer */

    case COPY_READ_WRITE_CLUSTER:
    case COPY_READ_WRITE:
        /*
         * In case of a failed copy_range request above, we may proceed with a
         * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
         * requests will be properly limited, so don't care too much. Moreover,
         * the most likely case (copy_range is unsupported for the
         * configuration, so the very first copy_range request fails) is
         * handled by switching to the large copy size only after the first
         * successful copy_range.
         */

        bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

        ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
        if (ret < 0) {
            trace_block_copy_read_fail(s, offset, ret);
            *error_is_read = true;
            goto out;
        }

        ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                             s->write_flags);
        if (ret < 0) {
            trace_block_copy_write_fail(s, offset, ret);
            *error_is_read = false;
            goto out;
        }

    out:
        qemu_vfree(bounce_buffer);
        break;

    default:
        abort();
    }

    return ret;
}

static coroutine_fn int block_copy_task_entry(AioTask *task)
{
    BlockCopyTask *t = container_of(task, BlockCopyTask, task);
    BlockCopyState *s = t->s;
    bool error_is_read = false;
    BlockCopyMethod method = t->method;
    int ret;

    WITH_GRAPH_RDLOCK_GUARD() {
        ret = block_copy_do_copy(s, t->req.offset, t->req.bytes, &method,
                                 &error_is_read);
    }

    WITH_QEMU_LOCK_GUARD(&s->lock) {
        if (s->method == t->method) {
            s->method = method;
        }

        if (ret < 0) {
            if (!t->call_state->ret) {
                t->call_state->ret = ret;
                t->call_state->error_is_read = error_is_read;
            }
        } else if (s->progress) {
            progress_work_done(s->progress, t->req.bytes);
        }
    }
    co_put_to_shres(s->mem, t->req.bytes);
    block_copy_task_end(t, ret);

    if (s->discard_source && ret == 0) {
        int64_t nbytes =
            MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
        WITH_GRAPH_RDLOCK_GUARD() {
            bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
        }
    }

    return ret;
}

static coroutine_fn GRAPH_RDLOCK
int block_copy_block_status(BlockCopyState *s, int64_t offset, int64_t bytes,
                            int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (qatomic_read(&s->skip_unallocated)) {
        base = bdrv_backing_chain_next(s->source->bs);
    } else {
        base = NULL;
    }

    ret = bdrv_co_block_status_above(s->source->bs, base, offset, bytes, &num,
                                     NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
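
/*
 * For example (illustrative values), with cluster_size = 64 KiB: a status
 * extent of 100 KiB in the middle of the image is rounded down to 64 KiB,
 * while the same extent ending exactly at s->len is rounded up to 128 KiB so
 * that the final partial cluster is still covered.
 */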

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via @pnum the number of contiguous clusters sharing this allocation.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        /* protected in backup_run() */
        ret = bdrv_co_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes)
{
    QEMU_LOCK_GUARD(&s->lock);

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    if (s->progress) {
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and a negative errno value on error.
 */
int64_t coroutine_fn block_copy_reset_unallocated(BlockCopyState *s,
                                                  int64_t offset,
                                                  int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        block_copy_reset(s, offset, bytes);
    }

    *count = bytes;
    return ret;
}
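
/*
 * A sync=top job can pre-scan the source from coroutine context roughly like
 * this (a sketch; 'len' stands for the source length as known to the caller):
 *
 *     int64_t offset = 0, count;
 *
 *     block_copy_set_skip_unallocated(s, true);
 *     while (offset < len) {
 *         if (block_copy_reset_unallocated(s, offset, &count) < 0) {
 *             break;
 *         }
 *         offset += count;
 *     }
 *     block_copy_set_skip_unallocated(s, false);
 */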

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_dirty_clusters(BlockCopyCallState *call_state)
{
    BlockCopyState *s = call_state->s;
    int64_t offset = call_state->offset;
    int64_t bytes = call_state->bytes;

    int ret = 0;
    bool found_dirty = false;
    int64_t end = offset + bytes;
    AioTaskPool *aio = NULL;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes && aio_task_pool_status(aio) == 0 &&
           !qatomic_read(&call_state->cancelled)) {
        BlockCopyTask *task;
        int64_t status_bytes;

        task = block_copy_task_create(s, call_state, offset, bytes);
        if (!task) {
            /* No more dirty bits in the bitmap */
            trace_block_copy_skip_range(s, offset, bytes);
            break;
        }
        if (task->req.offset > offset) {
            trace_block_copy_skip_range(s, offset, task->req.offset - offset);
        }

        found_dirty = true;

        ret = block_copy_block_status(s, task->req.offset, task->req.bytes,
                                      &status_bytes);
        assert(ret >= 0); /* never fails */
        if (status_bytes < task->req.bytes) {
            block_copy_task_shrink(task, status_bytes);
        }
        if (qatomic_read(&s->skip_unallocated) &&
            !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_task_end(task, 0);
            trace_block_copy_skip_range(s, task->req.offset, task->req.bytes);
            offset = task_end(task);
            bytes = end - offset;
            g_free(task);
            continue;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            task->method = COPY_WRITE_ZEROES;
        }

        if (!call_state->ignore_ratelimit) {
            uint64_t ns = ratelimit_calculate_delay(&s->rate_limit, 0);
            if (ns > 0) {
                block_copy_task_end(task, -EAGAIN);
                g_free(task);
                qemu_co_sleep_ns_wakeable(&call_state->sleep,
                                          QEMU_CLOCK_REALTIME, ns);
                continue;
            }
        }

        ratelimit_calculate_delay(&s->rate_limit, task->req.bytes);

        trace_block_copy_process(s, task->req.offset);

        co_get_from_shres(s->mem, task->req.bytes);

        offset = task_end(task);
        bytes = end - offset;

        if (!aio && bytes) {
            aio = aio_task_pool_new(call_state->max_workers);
        }

        ret = block_copy_task_run(aio, task);
        if (ret < 0) {
            goto out;
        }
    }

out:
    if (aio) {
        aio_task_pool_wait_all(aio);

        /*
         * We are not really interested in -ECANCELED returned from
         * block_copy_task_run. If it fails, it means some task already failed
         * for a real reason; return the first failure.
         * Still, assert that we don't overwrite a failure with success.
         *
         * Note: ret may be positive here because of the block-status result.
         */
        assert(ret >= 0 || aio_task_pool_status(aio) < 0);
        ret = aio_task_pool_status(aio);

        aio_task_pool_free(aio);
    }

    return ret < 0 ? ret : found_dirty;
}

void block_copy_kick(BlockCopyCallState *call_state)
{
    qemu_co_sleep_wake(&call_state->sleep);
}

/*
 * block_copy_common
 *
 * Copy the requested region, according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed it will help
 * us. If they fail, we will retry not-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not some parallel operation.
 */
static int coroutine_fn GRAPH_RDLOCK
block_copy_common(BlockCopyCallState *call_state)
{
    int ret;
    BlockCopyState *s = call_state->s;

    qemu_co_mutex_lock(&s->lock);
    QLIST_INSERT_HEAD(&s->calls, call_state, list);
    qemu_co_mutex_unlock(&s->lock);

    do {
        ret = block_copy_dirty_clusters(call_state);

        if (ret == 0 && !qatomic_read(&call_state->cancelled)) {
            WITH_QEMU_LOCK_GUARD(&s->lock) {
                /*
                 * Check that there is no task we still need to wait for to
                 * complete.
                 */
                ret = reqlist_wait_one(&s->reqs, call_state->offset,
                                       call_state->bytes, &s->lock);
                if (ret == 0) {
                    /*
                     * No pending tasks, but check the bitmap again in this
                     * same critical section, since a task might have failed
                     * between this and the critical section in
                     * block_copy_dirty_clusters().
                     *
                     * A reqlist_wait_one return value of 0 also means that it
                     * didn't release the lock. So, we are still in the same
                     * critical section, not interrupted by any concurrent
                     * access to state.
                     */
                    ret = bdrv_dirty_bitmap_next_dirty(s->copy_bitmap,
                                                       call_state->offset,
                                                       call_state->bytes) >= 0;
                }
            }
        }

        /*
         * We retry in two cases:
         * 1. Some progress was made
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0 && !qatomic_read(&call_state->cancelled));

    qatomic_store_release(&call_state->finished, true);

    if (call_state->cb) {
        call_state->cb(call_state->cb_opaque);
    }

    qemu_co_mutex_lock(&s->lock);
    QLIST_REMOVE(call_state, list);
    qemu_co_mutex_unlock(&s->lock);

    return ret;
}

static void coroutine_fn block_copy_async_co_entry(void *opaque)
{
    GRAPH_RDLOCK_GUARD();
    block_copy_common(opaque);
}

int coroutine_fn block_copy(BlockCopyState *s, int64_t start, int64_t bytes,
                            bool ignore_ratelimit, uint64_t timeout_ns,
                            BlockCopyAsyncCallbackFunc cb,
                            void *cb_opaque)
{
    int ret;
    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);

    *call_state = (BlockCopyCallState) {
        .s = s,
        .offset = start,
        .bytes = bytes,
        .ignore_ratelimit = ignore_ratelimit,
        .max_workers = BLOCK_COPY_MAX_WORKERS,
        .cb = cb,
        .cb_opaque = cb_opaque,
    };

    ret = qemu_co_timeout(block_copy_async_co_entry, call_state, timeout_ns,
                          g_free);
    if (ret < 0) {
        assert(ret == -ETIMEDOUT);
        block_copy_call_cancel(call_state);
        /* call_state will be freed by running coroutine. */
        return ret;
    }

    ret = call_state->ret;
    g_free(call_state);

    return ret;
}

BlockCopyCallState *block_copy_async(BlockCopyState *s,
                                     int64_t offset, int64_t bytes,
                                     int max_workers, int64_t max_chunk,
                                     BlockCopyAsyncCallbackFunc cb,
                                     void *cb_opaque)
{
    BlockCopyCallState *call_state = g_new(BlockCopyCallState, 1);

    *call_state = (BlockCopyCallState) {
        .s = s,
        .offset = offset,
        .bytes = bytes,
        .max_workers = max_workers,
        .max_chunk = max_chunk,
        .cb = cb,
        .cb_opaque = cb_opaque,

        .co = qemu_coroutine_create(block_copy_async_co_entry, call_state),
    };

    qemu_coroutine_enter(call_state->co);

    return call_state;
}

void block_copy_call_free(BlockCopyCallState *call_state)
{
    if (!call_state) {
        return;
    }

    assert(qatomic_read(&call_state->finished));
    g_free(call_state);
}

bool block_copy_call_finished(BlockCopyCallState *call_state)
{
    return qatomic_read(&call_state->finished);
}

bool block_copy_call_succeeded(BlockCopyCallState *call_state)
{
    return qatomic_load_acquire(&call_state->finished) &&
           !qatomic_read(&call_state->cancelled) &&
           call_state->ret == 0;
}

bool block_copy_call_failed(BlockCopyCallState *call_state)
{
    return qatomic_load_acquire(&call_state->finished) &&
           !qatomic_read(&call_state->cancelled) &&
           call_state->ret < 0;
}

bool block_copy_call_cancelled(BlockCopyCallState *call_state)
{
    return qatomic_read(&call_state->cancelled);
}

int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read)
{
    assert(qatomic_load_acquire(&call_state->finished));
    if (error_is_read) {
        *error_is_read = call_state->error_is_read;
    }
    return call_state->ret;
}
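
/*
 * Once block_copy_call_finished() reports true (or the async callback has
 * fired), a caller typically consumes the result roughly like this (a sketch;
 * error handling is the caller's business):
 *
 *     bool error_is_read;
 *
 *     if (block_copy_call_cancelled(call)) {
 *         ... handle cancellation ...
 *     } else if (block_copy_call_succeeded(call)) {
 *         ... done ...
 *     } else {
 *         int ret = block_copy_call_status(call, &error_is_read);
 *         ... report ret; error_is_read tells whether the read or the
 *             write side failed ...
 *     }
 *     block_copy_call_free(call);
 */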

/*
 * Note that cancelling and finishing are racy.
 * The user can cancel a block-copy call that has already finished.
 */
void block_copy_call_cancel(BlockCopyCallState *call_state)
{
    qatomic_set(&call_state->cancelled, true);
    block_copy_kick(call_state);
}

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

int64_t block_copy_cluster_size(BlockCopyState *s)
{
    return s->cluster_size;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    qatomic_set(&s->skip_unallocated, skip);
}

void block_copy_set_speed(BlockCopyState *s, uint64_t speed)
{
    ratelimit_set_speed(&s->rate_limit, speed, BLOCK_COPY_SLICE_TIME);

    /*
     * Note: it would be good to kick all call states from here, but that
     * should be done only from a coroutine, so as not to crash if the
     * s->calls list changes while entering one call. So for now, the only
     * user of this function kicks its single call_state by hand.
     */
}