/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob_int.h"
#include "block/block_backup.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/cutils.h"
#include "sysemu/block-backend.h"
#include "qemu/bitmap.h"
#include "qemu/error-report.h"

#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)

typedef struct BackupBlockJob {
    BlockJob common;
    BlockBackend *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t len;
    uint64_t bytes_read;
    int64_t cluster_size;
    bool compress;
    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;

    HBitmap *copy_bitmap;
} BackupBlockJob;

static const BlockJobDriver backup_job_driver;

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start_byte && start < req->end_byte) {
                qemu_co_queue_wait(&req->wait_queue, NULL);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
{
    req->start_byte = start;
    req->end_byte = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                      int64_t offset, uint64_t bytes,
                                      bool *error_is_read,
                                      bool is_write_notifier)
{
    BlockBackend *blk = job->common.blk;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t start, end; /* bytes */
    int n; /* bytes */

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = QEMU_ALIGN_DOWN(offset, job->cluster_size);
    end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size);

    trace_backup_do_cow_enter(job, start, offset, bytes);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start += job->cluster_size) {
        if (!hbitmap_get(job->copy_bitmap, start / job->cluster_size)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }
        hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1);

        trace_backup_do_cow_process(job, start);

        n = MIN(job->cluster_size, job->len - start);

        if (!bounce_buffer) {
            bounce_buffer = blk_blockalign(blk, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

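        /* Read the source cluster into the bounce buffer.  When called from
         * the before-write notifier, the read must not serialise against the
         * in-flight guest write that triggered us, hence
         * BDRV_REQ_NO_SERIALISING. */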
        ret = blk_co_preadv(blk, start, bounce_qiov.size, &bounce_qiov,
                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1);
            goto out;
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = blk_co_pwrite_zeroes(job->target, start,
                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = blk_co_pwritev(job->target, start,
                                 bounce_qiov.size, &bounce_qiov,
                                 job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1);
            goto out;
        }

        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
        job->bytes_read += n;
        block_job_progress_update(&job->common, n);
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, offset, bytes, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
    BdrvTrackedRequest *req = opaque;

    assert(req->bs == blk_bs(job->common.blk));
    assert(QEMU_IS_ALIGNED(req->offset, BDRV_SECTOR_SIZE));
    assert(QEMU_IS_ALIGNED(req->bytes, BDRV_SECTOR_SIZE));

    return backup_do_cow(job, req->offset, req->bytes, NULL, true);
}

static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
{
    BdrvDirtyBitmap *bm;
    BlockDriverState *bs = blk_bs(job->common.blk);

    if (ret < 0) {
        /* Merge the successor back into the parent, delete nothing. */
        bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
        assert(bm);
    } else {
        /* Everything is fine, delete this bitmap and install the backup. */
        bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
        assert(bm);
    }
}

static void backup_commit(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, 0);
    }
}

static void backup_abort(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, -1);
    }
}

static void backup_clean(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    assert(s->target);
    blk_unref(s->target);
    s->target = NULL;
}

static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    blk_set_aio_context(s->target, aio_context);
}

void backup_do_checkpoint(BlockJob *job, Error **errp)
{
    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
    int64_t len;

    assert(block_job_driver(job) == &backup_job_driver);

    if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
        error_setg(errp, "The backup job only supports block checkpoint in"
                   " sync=none mode");
        return;
    }

    len = DIV_ROUND_UP(backup_job->len, backup_job->cluster_size);
    hbitmap_set(backup_job->copy_bitmap, 0, len);
}

void backup_wait_for_overlapping_requests(BlockJob *job, int64_t offset,
                                          uint64_t bytes)
{
    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
    int64_t start, end;

    assert(block_job_driver(job) == &backup_job_driver);

    start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
    end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
    wait_for_overlapping_requests(backup_job, start, end);
}

void backup_cow_request_begin(CowRequest *req, BlockJob *job,
                              int64_t offset, uint64_t bytes)
{
    BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
    int64_t start, end;

    assert(block_job_driver(job) == &backup_job_driver);

    start = QEMU_ALIGN_DOWN(offset, backup_job->cluster_size);
    end = QEMU_ALIGN_UP(offset + bytes, backup_job->cluster_size);
    cow_request_begin(req, backup_job, start, end);
}

void backup_cow_request_end(CowRequest *req)
{
    cow_request_end(req);
}

static void backup_drain(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    /* Need to keep a reference in case blk_drain triggers execution
     * of backup_complete...
     */
    if (s->target) {
        BlockBackend *target = s->target;
        blk_ref(target);
        blk_drain(target);
        blk_unref(target);
    }
}

static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&job->common, job->on_target_error,
                                      false, error);
    }
}

typedef struct {
    int ret;
} BackupCompleteData;

static void backup_complete(BlockJob *job, void *opaque)
{
    BackupCompleteData *data = opaque;

    block_job_completed(job, data->ret);
    g_free(data);
}

static bool coroutine_fn yield_and_check(BackupBlockJob *job)
{
    uint64_t delay_ns;

    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    /* We need to yield even for delay_ns = 0 so that bdrv_drain_all() can
     * return. Without a yield, the VM would not reboot. */
    delay_ns = block_job_ratelimit_get_delay(&job->common, job->bytes_read);
    job->bytes_read = 0;
    block_job_sleep_ns(&job->common, delay_ns);

    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    return false;
}

static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
{
    int ret;
    bool error_is_read;
    int64_t cluster;
    HBitmapIter hbi;

    hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
    while ((cluster = hbitmap_iter_next(&hbi)) != -1) {
        do {
            if (yield_and_check(job)) {
                return 0;
            }
            ret = backup_do_cow(job, cluster * job->cluster_size,
                                job->cluster_size, &error_is_read, false);
            if (ret < 0 && backup_error_action(job, error_is_read, -ret) ==
                           BLOCK_ERROR_ACTION_REPORT)
            {
                return ret;
            }
        } while (ret < 0);
    }

    return 0;
}

/* init copy_bitmap from sync_bitmap */
static void backup_incremental_init_copy_bitmap(BackupBlockJob *job)
{
    BdrvDirtyBitmapIter *dbi;
    int64_t offset;
    int64_t end = DIV_ROUND_UP(bdrv_dirty_bitmap_size(job->sync_bitmap),
                               job->cluster_size);

    dbi = bdrv_dirty_iter_new(job->sync_bitmap);
    while ((offset = bdrv_dirty_iter_next(dbi)) != -1) {
        int64_t cluster = offset / job->cluster_size;
        int64_t next_cluster;

        offset += bdrv_dirty_bitmap_granularity(job->sync_bitmap);
        if (offset >= bdrv_dirty_bitmap_size(job->sync_bitmap)) {
            hbitmap_set(job->copy_bitmap, cluster, end - cluster);
            break;
        }

        offset = bdrv_dirty_bitmap_next_zero(job->sync_bitmap, offset);
        if (offset == -1) {
            hbitmap_set(job->copy_bitmap, cluster, end - cluster);
            break;
        }

        next_cluster = DIV_ROUND_UP(offset, job->cluster_size);
        hbitmap_set(job->copy_bitmap, cluster, next_cluster - cluster);
        if (next_cluster >= end) {
            break;
        }

        bdrv_set_dirty_iter(dbi, next_cluster * job->cluster_size);
    }

    /* TODO block_job_progress_set_remaining() would make more sense */
    block_job_progress_update(&job->common,
        job->len - hbitmap_count(job->copy_bitmap) * job->cluster_size);

    bdrv_dirty_iter_free(dbi);
}

static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
    BlockDriverState *bs = blk_bs(job->common.blk);
    int64_t offset, nb_clusters;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

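    /* copy_bitmap tracks one bit per backup cluster that still needs to be
     * copied to the target; it starts fully set, or is seeded from the
     * dirty bitmap for sync=incremental below. */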
    nb_clusters = DIV_ROUND_UP(job->len, job->cluster_size);
    block_job_progress_set_remaining(&job->common, job->len);

    job->copy_bitmap = hbitmap_alloc(nb_clusters, 0);
    if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        backup_incremental_init_copy_bitmap(job);
    } else {
        hbitmap_set(job->copy_bitmap, 0, nb_clusters);
    }

    job->before_write.notify = backup_before_write_notify;
    bdrv_add_before_write_notifier(bs, &job->before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        /* All bits are set in copy_bitmap to allow any cluster to be copied.
         * This does not actually require them to be copied. */
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            block_job_yield(&job->common);
        }
    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        ret = backup_run_incremental(job);
    } else {
        /* Both the FULL and TOP sync modes require copying. */
        for (offset = 0; offset < job->len;
             offset += job->cluster_size) {
            bool error_is_read;
            int alloced = 0;

            if (yield_and_check(job)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i;
                int64_t n;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < job->cluster_size;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs, offset + i,
                                          job->cluster_size - i, &n);
                    i += n;

                    if (alloced || n == 0) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* In FULL sync mode we copy the whole drive. */
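            /* A negative 'alloced' means bdrv_is_allocated() failed; feed it
             * through the same error handling as a failed cluster copy. */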
            if (alloced < 0) {
                ret = alloced;
            } else {
                ret = backup_do_cow(job, offset, job->cluster_size,
                                    &error_is_read, false);
            }
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BLOCK_ERROR_ACTION_REPORT) {
                    break;
                } else {
                    offset -= job->cluster_size;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&job->before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    hbitmap_free(job->copy_bitmap);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&job->common, backup_complete, data);
}

static const BlockJobDriver backup_job_driver = {
    .instance_size          = sizeof(BackupBlockJob),
    .job_type               = BLOCK_JOB_TYPE_BACKUP,
    .start                  = backup_run,
    .commit                 = backup_commit,
    .abort                  = backup_abort,
    .clean                  = backup_clean,
    .attached_aio_context   = backup_attached_aio_context,
    .drain                  = backup_drain,
};

BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
                            BlockDriverState *target, int64_t speed,
                            MirrorSyncMode sync_mode,
                            BdrvDirtyBitmap *sync_bitmap,
                            bool compress,
                            BlockdevOnError on_source_error,
                            BlockdevOnError on_target_error,
                            int creation_flags,
                            BlockCompletionFunc *cb, void *opaque,
                            BlockJobTxn *txn, Error **errp)
{
    int64_t len;
    BlockDriverInfo bdi;
    BackupBlockJob *job = NULL;
    int ret;

    assert(bs);
    assert(target);

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
        return NULL;
    }

    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
        return NULL;
    }

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));
        return NULL;
    }

    if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
        error_setg(errp, "Compression is not supported for this drive %s",
                   bdrv_get_device_name(target));
        return NULL;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
        return NULL;
    }

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
        return NULL;
    }

    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        if (!sync_bitmap) {
            error_setg(errp, "must provide a valid bitmap name for "
                             "\"incremental\" sync mode");
            return NULL;
        }

        /* Create a new bitmap, and freeze/disable this one. */
        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
            return NULL;
        }
    } else if (sync_bitmap) {
        error_setg(errp,
                   "a sync_bitmap was provided to backup_run, "
                   "but received an incompatible sync_mode (%s)",
                   MirrorSyncMode_str(sync_mode));
        return NULL;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        goto error;
    }

    /* job->len is fixed, so we can't allow resize */
    job = block_job_create(job_id, &backup_job_driver, txn, bs,
                           BLK_PERM_CONSISTENT_READ,
                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
                           BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD,
                           speed, creation_flags, cb, opaque, errp);
    if (!job) {
        goto error;
    }

    /* The target must match the source in size, so no resize here either */
    job->target = blk_new(BLK_PERM_WRITE,
                          BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
                          BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD);
    ret = blk_insert_bs(job->target, target, errp);
    if (ret < 0) {
        goto error;
    }

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
    job->compress = compress;

    /* If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible. */
    ret = bdrv_get_info(target, &bdi);
    if (ret == -ENOTSUP && !target->backing) {
        /* Cluster size is not defined */
        warn_report("The target block device doesn't provide "
                    "information about the block size and it doesn't have a "
                    "backing file. The default block size of %u bytes is "
                    "used. If the actual block size of the target exceeds "
                    "this default, the backup may be unusable",
                    BACKUP_CLUSTER_SIZE_DEFAULT);
        job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT;
    } else if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
            "which has no backing file");
        error_append_hint(errp,
            "Aborting, since this may create an unusable destination image\n");
        goto error;
    } else if (ret < 0 && target->backing) {
        /* Not fatal; just trudge on ahead. */
        job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT;
    } else {
        job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
    }

    /* Required permissions are already taken with target's blk_new() */
    block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
                       &error_abort);
    job->len = len;

    return &job->common;

error:
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
    if (job) {
        backup_clean(&job->common);
        block_job_early_fail(&job->common);
    }

    return NULL;
}