/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob_int.h"
#include "block/block_backup.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/cutils.h"
#include "sysemu/block-backend.h"
#include "qemu/bitmap.h"
#include "qemu/error-report.h"

#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
#define SLICE_TIME 100000000ULL /* ns */

typedef struct BackupBlockJob {
    BlockJob common;
    BlockBackend *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t bytes_read;
    int64_t cluster_size;
    bool compress;
    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;

    HBitmap *copy_bitmap;
} BackupBlockJob;

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start_byte && start < req->end_byte) {
                qemu_co_queue_wait(&req->wait_queue, NULL);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
{
    req->start_byte = start;
    req->end_byte = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/* Copy the clusters covering [offset, offset + bytes) from the source to the
 * target, skipping clusters whose copy_bitmap bit is already clear. */
static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                      int64_t offset, uint64_t bytes,
                                      bool *error_is_read,
                                      bool is_write_notifier)
{
    BlockBackend *blk = job->common.blk;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t start, end; /* bytes */
    int n; /* bytes */

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = QEMU_ALIGN_DOWN(offset, job->cluster_size);
    end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size);

    trace_backup_do_cow_enter(job, start, offset, bytes);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start += job->cluster_size) {
        if (!hbitmap_get(job->copy_bitmap, start / job->cluster_size)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }
        hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1);

        trace_backup_do_cow_process(job, start);

        n = MIN(job->cluster_size, job->common.len - start);

        if (!bounce_buffer) {
            bounce_buffer = blk_blockalign(blk, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        ret = blk_co_preadv(blk, start, bounce_qiov.size, &bounce_qiov,
                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1);
            goto out;
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = blk_co_pwrite_zeroes(job->target, start,
                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = blk_co_pwritev(job->target, start,
                                 bounce_qiov.size, &bounce_qiov,
                                 job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1);
            goto out;
        }

        /* Publish progress; guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
        job->bytes_read += n;
        job->common.offset += n;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, offset, bytes, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
    BdrvTrackedRequest *req = opaque;

    assert(req->bs == blk_bs(job->common.blk));
    assert(QEMU_IS_ALIGNED(req->offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(req->bytes, BDRV_SECTOR_SIZE));

    return backup_do_cow(job, req->offset, req->bytes, NULL, true);
}

static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
}

static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
{
    BdrvDirtyBitmap *bm;
    BlockDriverState *bs = blk_bs(job->common.blk);

    if (ret < 0 || block_job_is_cancelled(&job->common)) {
        /* Merge the successor back into the parent, delete nothing. */
        bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
        assert(bm);
    } else {
        /* Everything is fine, delete this bitmap and install the backup. */
        bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
        assert(bm);
    }
}

static void backup_commit(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, 0);
    }
}

static void backup_abort(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, -1);
    }
}

static void backup_clean(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    assert(s->target);
    blk_unref(s->target);
    s->target = NULL;
}

static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    blk_set_aio_context(s->target, aio_context);
}

void backup_do_checkpoint(BlockJob *job, Error **errp)
{
    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
    int64_t len;

    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);

    if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
        error_setg(errp, "The backup job only supports block checkpoint in"
                   " sync=none mode");
        return;
    }

    len = DIV_ROUND_UP(backup_job->common.len, backup_job->cluster_size);
    hbitmap_set(backup_job->copy_bitmap, 0, len);
}

void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset,
                                          uint64_t bytes)
{
    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
    int64_t start, end;

    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);

    start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
    end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
    wait_for_overlapping_requests(backup_job, start, end);
}

void backup_cow_request_begin(CowRequest *req, BlockJob *job,
                              int64_t offset, uint64_t bytes)
{
    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
    int64_t start, end;

    assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);

    start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
    end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
    cow_request_begin(req, backup_job, start, end);
}

void backup_cow_request_end(CowRequest *req)
{
    cow_request_end(req);
}

static void backup_drain(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    /* Need to keep a reference in case blk_drain triggers execution
     * of backup_complete...
     */
    if (s->target) {
        BlockBackend *target = s->target;
        blk_ref(target);
        blk_drain(target);
        blk_unref(target);
    }
}

static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&job->common, job->on_target_error,
                                      false, error);
    }
}

typedef struct {
    int ret;
} BackupCompleteData;

static void backup_complete(BlockJob *job, void *opaque)
{
    BackupCompleteData *data = opaque;

    block_job_completed(job, data->ret);
    g_free(data);
}

static bool coroutine_fn yield_and_check(BackupBlockJob *job)
{
    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    /* We need to yield so that bdrv_drain_all() returns.
     * (without this, the VM does not reboot)
     */
    if (job->common.speed) {
        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
                                                      job->bytes_read);
        job->bytes_read = 0;
        block_job_sleep_ns(&job->common, delay_ns);
    } else {
        block_job_sleep_ns(&job->common, 0);
    }

    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    return false;
}

static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
{
    int ret;
    bool error_is_read;
    int64_t cluster;
    HBitmapIter hbi;

    hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
    while ((cluster = hbitmap_iter_next(&hbi)) != -1) {
        do {
            if (yield_and_check(job)) {
                return 0;
            }
            ret = backup_do_cow(job, cluster * job->cluster_size,
                                job->cluster_size, &error_is_read, false);
            if (ret < 0 && backup_error_action(job, error_is_read, -ret) ==
                           BLOCK_ERROR_ACTION_REPORT)
            {
                return ret;
            }
        } while (ret < 0);
    }

    return 0;
}

/* init copy_bitmap from sync_bitmap */
static void backup_incremental_init_copy_bitmap(BackupBlockJob *job)
{
    BdrvDirtyBitmapIter *dbi;
    int64_t offset;
    int64_t end = DIV_ROUND_UP(bdrv_dirty_bitmap_size(job->sync_bitmap),
                               job->cluster_size);

    dbi = bdrv_dirty_iter_new(job->sync_bitmap);
    while ((offset = bdrv_dirty_iter_next(dbi)) != -1) {
        int64_t cluster = offset / job->cluster_size;
        int64_t next_cluster;

        offset += bdrv_dirty_bitmap_granularity(job->sync_bitmap);
        if (offset >= bdrv_dirty_bitmap_size(job->sync_bitmap)) {
            hbitmap_set(job->copy_bitmap, cluster, end - cluster);
            break;
        }

        offset = bdrv_dirty_bitmap_next_zero(job->sync_bitmap, offset);
        if (offset == -1) {
            hbitmap_set(job->copy_bitmap, cluster, end - cluster);
            break;
        }

        next_cluster = DIV_ROUND_UP(offset, job->cluster_size);
        hbitmap_set(job->copy_bitmap, cluster, next_cluster - cluster);
        if (next_cluster >= end) {
            break;
        }

        bdrv_set_dirty_iter(dbi, next_cluster * job->cluster_size);
    }

    job->common.offset = job->common.len -
                         hbitmap_count(job->copy_bitmap) * job->cluster_size;

    bdrv_dirty_iter_free(dbi);
}

static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
    BlockDriverState *bs = blk_bs(job->common.blk);
    int64_t offset, nb_clusters;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    nb_clusters = DIV_ROUND_UP(job->common.len, job->cluster_size);
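    /* copy_bitmap holds one bit per cluster; a set bit marks a cluster that
     * still needs to be copied to the target (backup_do_cow clears it once
     * the cluster has been written out). */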
    job->copy_bitmap = hbitmap_alloc(nb_clusters, 0);
    if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        backup_incremental_init_copy_bitmap(job);
    } else {
        hbitmap_set(job->copy_bitmap, 0, nb_clusters);
    }

    job->before_write.notify = backup_before_write_notify;
    bdrv_add_before_write_notifier(bs, &job->before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        /* All bits are set in copy_bitmap to allow any cluster to be copied.
         * This does not actually require them to be copied. */
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            block_job_yield(&job->common);
        }
    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        ret = backup_run_incremental(job);
    } else {
        /* Both FULL and TOP sync modes require copying. */
        for (offset = 0; offset < job->common.len;
             offset += job->cluster_size) {
            bool error_is_read;
            int alloced = 0;

            if (yield_and_check(job)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i;
                int64_t n;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < job->cluster_size;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs, offset + i,
                                          job->cluster_size - i, &n);
                    i += n;

                    if (alloced || n == 0) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* In FULL sync mode we copy the whole drive. */
            if (alloced < 0) {
                ret = alloced;
            } else {
                ret = backup_do_cow(job, offset, job->cluster_size,
                                    &error_is_read, false);
            }
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BLOCK_ERROR_ACTION_REPORT) {
                    break;
                } else {
                    offset -= job->cluster_size;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&job->before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    hbitmap_free(job->copy_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&job->common, backup_complete, data);
}

static const BlockJobDriver backup_job_driver = {
    .instance_size          = sizeof(BackupBlockJob),
    .job_type               = BLOCK_JOB_TYPE_BACKUP,
    .start                  = backup_run,
    .set_speed              = backup_set_speed,
    .commit                 = backup_commit,
    .abort                  = backup_abort,
    .clean                  = backup_clean,
    .attached_aio_context   = backup_attached_aio_context,
    .drain                  = backup_drain,
};

BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, int64_t speed,
                  MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
                  bool compress,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  int creation_flags,
                  BlockCompletionFunc *cb, void *opaque,
                  BlockJobTxn *txn, Error **errp)
{
    int64_t len;
    BlockDriverInfo bdi;
    BackupBlockJob *job = NULL;
    int ret;

    assert(bs);
    assert(target);

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
        return NULL;
    }

    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
        return NULL;
    }

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));
        return NULL;
    }

    if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
        error_setg(errp, "Compression is not supported for this drive %s",
                   bdrv_get_device_name(target));
        return NULL;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
        return NULL;
    }

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
        return NULL;
    }

    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        if (!sync_bitmap) {
            error_setg(errp, "must provide a valid bitmap name for "
                             "\"incremental\" sync mode");
            return NULL;
        }

        /* Create a new bitmap, and freeze/disable this one. */
        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
            return NULL;
        }
    } else if (sync_bitmap) {
        error_setg(errp,
                   "a sync_bitmap was provided to backup_run, "
                   "but received an incompatible sync_mode (%s)",
                   MirrorSyncMode_str(sync_mode));
        return NULL;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        goto error;
    }

    /* job->common.len is fixed, so we can't allow resize */
    job = block_job_create(job_id, &backup_job_driver, bs,
                           BLK_PERM_CONSISTENT_READ,
                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
                           BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD,
                           speed, creation_flags, cb, opaque, errp);
    if (!job) {
        goto error;
    }

    /* The target must match the source in size, so no resize here either */
    job->target = blk_new(BLK_PERM_WRITE,
                          BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
                          BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD);
    ret = blk_insert_bs(job->target, target, errp);
    if (ret < 0) {
        goto error;
    }

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
    job->compress = compress;

    /* If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible. */
    ret = bdrv_get_info(target, &bdi);
    if (ret == -ENOTSUP && !target->backing) {
        /* Cluster size is not defined */
        warn_report("The target block device doesn't provide "
                    "information about the block size and it doesn't have a "
                    "backing file. The default block size of %u bytes is "
                    "used. If the actual block size of the target exceeds "
                    "this default, the backup may be unusable",
                    BACKUP_CLUSTER_SIZE_DEFAULT);
        job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT;
    } else if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
            "which has no backing file");
        error_append_hint(errp,
            "Aborting, since this may create an unusable destination image\n");
        goto error;
    } else if (ret < 0 && target->backing) {
        /* Not fatal; just trudge on ahead. */
        job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT;
    } else {
        job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
    }

    /* Required permissions are already taken with target's blk_new() */
    block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
                       &error_abort);
    job->common.len = len;
    block_job_txn_add_job(txn, &job->common);

    return &job->common;

 error:
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
    if (job) {
        backup_clean(&job->common);
        block_job_early_fail(&job->common);
    }

    return NULL;
}