/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

typedef struct BlockCopyInFlightReq {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyInFlightReq) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} BlockCopyInFlightReq;

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, who is responsible for appropriate
     * permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: it may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;

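/*
 * Illustrative sketch (not part of this file's logic) of the skip_unallocated
 * flow described above: a hypothetical sync=top caller holding a
 * BlockCopyState *bcs and the image length 'len' might pre-scan the source
 * roughly like this before (or while) copying:
 *
 *     int64_t offset, count;
 *
 *     block_copy_set_skip_unallocated(bcs, true);
 *     for (offset = 0; offset < len; offset += count) {
 *         if (block_copy_reset_unallocated(bcs, offset, &count) < 0) {
 *             break;   // remaining bits stay set; block_copy() still skips
 *         }            // unallocated areas while skip_unallocated is true
 *     }
 *
 * Offsets stay cluster-aligned because *count is always a whole number of
 * clusters. Until this scan finishes, copy_bitmap may still have bits set for
 * unallocated areas, which is exactly the situation skip_unallocated covers.
 */
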
static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
                                                           int64_t offset,
                                                           int64_t bytes)
{
    BlockCopyInFlightReq *req;

    QLIST_FOREACH(req, &s->inflight_reqs, list) {
        if (offset + bytes > req->offset && offset < req->offset + req->bytes) {
            return req;
        }
    }

    return NULL;
}

/*
 * If there are no intersecting requests return false. Otherwise, wait for the
 * first found intersecting request to finish and return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, bytes);

    if (!req) {
        return false;
    }

    qemu_co_queue_wait(&req->wait_queue, NULL);

    return true;
}

/* Called only on a full-dirty region */
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t offset, int64_t bytes)
{
    assert(!find_conflicting_inflight_req(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    req->offset = offset;
    req->bytes = bytes;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

/*
 * block_copy_inflight_req_shrink
 *
 * Drop the tail of the request to be handled later. Set dirty bits back and
 * wake up all requests waiting for us (some of them may no longer intersect
 * with the shrunk request).
 */
static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
        BlockCopyInFlightReq *req, int64_t new_bytes)
{
    if (new_bytes == req->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < req->bytes);

    s->in_flight_bytes -= req->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          req->offset + new_bytes, req->bytes - new_bytes);

    req->bytes = new_bytes;
    qemu_co_queue_restart_all(&req->wait_queue);
}

static void coroutine_fn block_copy_inflight_req_end(BlockCopyState *s,
                                                     BlockCopyInFlightReq *req,
                                                     int ret)
{
    s->in_flight_bytes -= req->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(s->copy_bitmap, req->offset, req->bytes);
    }
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall back
         * to buffered copying (read and write respect max_transfer on their
         * own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep a small copy_size until the first
         * successful copy_range (see block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}

void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}

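/*
 * Illustrative usage sketch (a hedged example, not part of this file): a
 * caller that already owns two properly-permissioned BdrvChild objects might
 * drive the API roughly as below. 'src', 'tgt', 'bytes_cb', 'job' and 'errp'
 * are hypothetical names provided by that caller; the image is assumed to be
 * at least one cluster long.
 *
 *     BlockCopyState *bcs;
 *     bool error_is_read;
 *     int ret;
 *
 *     bcs = block_copy_state_new(src, tgt, 64 * KiB, 0, errp);
 *     if (!bcs) {
 *         return;
 *     }
 *     block_copy_set_progress_callback(bcs, bytes_cb, job);
 *     block_copy_set_progress_meter(bcs, &job->progress);
 *
 *     // Later, from a coroutine: copy one cluster-aligned range.
 *     ret = block_copy(bcs, 0, 64 * KiB, &error_is_read);
 *
 *     block_copy_state_free(bcs);
 */
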
/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to
 * clusters.
 *
 * No synchronization here: neither bitmap nor intersecting-request handling,
 * only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size. copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unset it during the previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case of a failed copy_range request above, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so don't care too much. Moreover, the
     * most likely case (copy_range is unsupported for the configuration, so
     * the very first copy_range request fails) is handled by setting a large
     * copy_size only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

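/*
 * Worked example of the copy_size adjustment above, with assumed numbers:
 * cluster_size = 64 KiB and block_copy_max_transfer() = 3 MiB. Before the
 * first successful copy_range, copy_size = MAX(64 KiB, BLOCK_COPY_MAX_BUFFER)
 * = 1 MiB. After it:
 *
 *     copy_size = MIN(MAX(64 KiB, BLOCK_COPY_MAX_COPY_RANGE),
 *                     QEMU_ALIGN_DOWN(3 MiB, 64 KiB))
 *               = MIN(16 MiB, 3 MiB)
 *               = 3 MiB
 *
 * so later chunks are offered to copy_range in pieces of up to 3 MiB.
 */
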
static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}

/*
 * Check if the cluster starting at @offset is allocated or not.
 * Return via @pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: the unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at @offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and -ret on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}

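/*
 * Worked example for the two functions above, with assumed numbers:
 * cluster_size = 64 KiB and a source whose first 32 KiB are allocated. The
 * first bdrv_is_allocated() call in block_copy_is_cluster_allocated() returns
 * ret = 1 with count = 32 KiB, so the function returns 1 and
 * *pnum = DIV_ROUND_UP(32 KiB, 64 KiB) = 1: the partially allocated cluster
 * counts as allocated. block_copy_reset_unallocated() therefore keeps its bit
 * in copy_bitmap and reports *count = 64 KiB.
 */
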
/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and -errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * The block_copy() user is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        BlockCopyInFlightReq req;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        block_copy_inflight_req_begin(s, &req, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fail */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_inflight_req_shrink(s, &req, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_inflight_req_end(s, &req, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_inflight_req_end(s, &req, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}

/*
 * block_copy
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, that helps
 * us; if they fail, we will retry the not-yet-copied regions. So if we return
 * an error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not of some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress was made.
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request.
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}

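/*
 * Illustrative error-handling sketch (not part of this file): because a
 * negative return always refers to an I/O failure of _this_ call,
 * *error_is_read lets a hypothetical caller choose between its source- and
 * target-side error policies. 'bcs', 'handle_error', 'on_source_error' and
 * 'on_target_error' are assumed caller-side names.
 *
 *     bool error_is_read = false;
 *     int ret = block_copy(bcs, offset, bytes, &error_is_read);
 *
 *     if (ret < 0) {
 *         handle_error(error_is_read ? on_source_error : on_target_error);
 *     }
 */
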
BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}