/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/cutils.h"
#include "sysemu/block-backend.h"
#include "qemu/bitmap.h"

#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
#define SLICE_TIME 100000000ULL /* ns */

typedef struct CowRequest {
    int64_t start;
    int64_t end;
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;

typedef struct BackupBlockJob {
    BlockJob common;
    BlockBackend *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
    unsigned long *done_bitmap;
    int64_t cluster_size;
    bool compress;
    NotifierWithReturn before_write;
    QLIST_HEAD(, CowRequest) inflight_reqs;
} BackupBlockJob;

/* Size of a cluster in sectors, instead of bytes. */
static inline int64_t cluster_size_sectors(BackupBlockJob *job)
{
    return job->cluster_size / BDRV_SECTOR_SIZE;
}

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
{
    req->start = start;
    req->end = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

static int coroutine_fn backup_do_cow(BackupBlockJob *job,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read,
                                      bool is_write_notifier)
{
    BlockBackend *blk = job->common.blk;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = sector_num / sectors_per_cluster;
    end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
        if (test_bit(start, job->done_bitmap)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

        n = MIN(sectors_per_cluster,
                job->common.len / BDRV_SECTOR_SIZE -
                start * sectors_per_cluster);

        if (!bounce_buffer) {
            bounce_buffer = blk_blockalign(blk, job->cluster_size);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        ret = blk_co_preadv(blk, start * job->cluster_size,
                            bounce_qiov.size, &bounce_qiov,
                            is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            goto out;
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = blk_co_pwrite_zeroes(job->target, start * job->cluster_size,
                                       bounce_qiov.size, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = blk_co_pwritev(job->target, start * job->cluster_size,
                                 bounce_qiov.size, &bounce_qiov,
                                 job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            goto out;
        }

        set_bit(start, job->done_bitmap);

        /* Publish progress, guest I/O counts as progress too. Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
        job->sectors_read += n;
        job->common.offset += n * BDRV_SECTOR_SIZE;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BackupBlockJob *job = container_of(notifier, BackupBlockJob, before_write);
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

    assert(req->bs == blk_bs(job->common.blk));
    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    return backup_do_cow(job, sector_num, nb_sectors, NULL, true);
}

static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
{
    BdrvDirtyBitmap *bm;
    BlockDriverState *bs = blk_bs(job->common.blk);

    if (ret < 0 || block_job_is_cancelled(&job->common)) {
        /* Merge the successor back into the parent, delete nothing. */
        bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
        assert(bm);
    } else {
        /* Everything is fine, delete this bitmap and install the backup. */
        bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
        assert(bm);
    }
}

static void backup_commit(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, 0);
    }
}

static void backup_abort(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    if (s->sync_bitmap) {
        backup_cleanup_sync_bitmap(s, -1);
    }
}

static void backup_attached_aio_context(BlockJob *job, AioContext *aio_context)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    blk_set_aio_context(s->target, aio_context);
}

static const BlockJobDriver backup_job_driver = {
    .instance_size = sizeof(BackupBlockJob),
    .job_type = BLOCK_JOB_TYPE_BACKUP,
    .set_speed = backup_set_speed,
    .commit = backup_commit,
    .abort = backup_abort,
    .attached_aio_context = backup_attached_aio_context,
};

static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->on_source_error,
                                      true, error);
    } else {
        return block_job_error_action(&job->common, job->on_target_error,
                                      false, error);
    }
}

typedef struct {
    int ret;
} BackupCompleteData;

static void backup_complete(BlockJob *job, void *opaque)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

    blk_unref(s->target);

    block_job_completed(job, data->ret);
    g_free(data);
}

static bool coroutine_fn yield_and_check(BackupBlockJob *job)
{
    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    /* we need to yield so that bdrv_drain_all() returns.
     * (without this, the VM does not reboot)
     */
    if (job->common.speed) {
        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
                                                      job->sectors_read);
        job->sectors_read = 0;
        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
    } else {
        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
    }

    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    return false;
}

static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
{
    bool error_is_read;
    int ret = 0;
    int clusters_per_iter;
    uint32_t granularity;
    int64_t sector;
    int64_t cluster;
    int64_t end;
    int64_t last_cluster = -1;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
    clusters_per_iter = MAX((granularity / job->cluster_size), 1);
    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);

    /* Find the next dirty sector(s) */
    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
        cluster = sector / sectors_per_cluster;

        /* Fake progress updates for any clusters we skipped */
        if (cluster != last_cluster + 1) {
            job->common.offset += ((cluster - last_cluster - 1) *
                                   job->cluster_size);
        }

        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
            do {
                if (yield_and_check(job)) {
                    return ret;
                }
                ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                    sectors_per_cluster, &error_is_read,
                                    false);
                if ((ret < 0) &&
                    backup_error_action(job, error_is_read, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    return ret;
                }
            } while (ret < 0);
        }

        /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
        if (granularity < job->cluster_size) {
            bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster);
        }

        last_cluster = cluster - 1;
    }

    /* Play some final catchup with the progress meter */
    end = DIV_ROUND_UP(job->common.len, job->cluster_size);
    if (last_cluster + 1 < end) {
        job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
    }

    return ret;
}

static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
    BlockDriverState *bs = blk_bs(job->common.blk);
    BlockBackend *target = job->target;
    int64_t start, end;
    int64_t sectors_per_cluster = cluster_size_sectors(job);
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
    end = DIV_ROUND_UP(job->common.len, job->cluster_size);

    job->done_bitmap = bitmap_new(end);

    job->before_write.notify = backup_before_write_notify;
    bdrv_add_before_write_notifier(bs, &job->before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled. We just let our before_write
             * notify callback service CoW requests. */
            block_job_yield(&job->common);
        }
    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        ret = backup_run_incremental(job);
    } else {
        /* Both FULL and TOP sync modes require copying. */
        for (; start < end; start++) {
            bool error_is_read;
            if (yield_and_check(job)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < sectors_per_cluster;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length. We end up copying more than
                     * needed, but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
                                          start * sectors_per_cluster + i,
                                          sectors_per_cluster - i, &n);
                    i += n;

                    if (alloced == 1 || n == 0) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* In FULL sync mode we copy the whole drive. */
            ret = backup_do_cow(job, start * sectors_per_cluster,
                                sectors_per_cluster, &error_is_read, false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BLOCK_ERROR_ACTION_REPORT) {
                    break;
                } else {
                    start--;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&job->before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);
    g_free(job->done_bitmap);

    bdrv_op_unblock_all(blk_bs(target), job->common.blocker);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&job->common, backup_complete, data);
}

void backup_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *target, int64_t speed,
                  MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
                  bool compress,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockCompletionFunc *cb, void *opaque,
                  BlockJobTxn *txn, Error **errp)
{
    int64_t len;
    BlockDriverInfo bdi;
    BackupBlockJob *job = NULL;
    int ret;

    assert(bs);
    assert(target);

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
        return;
    }

    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
        return;
    }

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));
        return;
    }

    if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
        error_setg(errp, "Compression is not supported for this drive %s",
                   bdrv_get_device_name(target));
        return;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
        return;
    }

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
        return;
    }

    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        if (!sync_bitmap) {
            error_setg(errp, "must provide a valid bitmap name for "
                             "\"incremental\" sync mode");
            return;
        }

        /* Create a new bitmap, and freeze/disable this one. */
        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
            return;
        }
    } else if (sync_bitmap) {
        error_setg(errp,
                   "a sync_bitmap was provided to backup_run, "
                   "but received an incompatible sync_mode (%s)",
                   MirrorSyncMode_lookup[sync_mode]);
        return;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        goto error;
    }

    job = block_job_create(job_id, &backup_job_driver, bs, speed,
                           cb, opaque, errp);
    if (!job) {
        goto error;
    }

    job->target = blk_new();
    blk_insert_bs(job->target, target);

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
    job->compress = compress;

    /* If there is no backing file on the target, we cannot rely on COW if our
     * backup cluster size is smaller than the target cluster size. Even for
     * targets with a backing file, try to avoid COW if possible. */
    ret = bdrv_get_info(target, &bdi);
    if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
            "Couldn't determine the cluster size of the target image, "
            "which has no backing file");
        error_append_hint(errp,
            "Aborting, since this may create an unusable destination image\n");
        goto error;
    } else if (ret < 0 && target->backing) {
        /* Not fatal; just trudge on ahead. */
        job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT;
    } else {
        job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
    }

    bdrv_op_block_all(target, job->common.blocker);
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run, job);
    block_job_txn_add_job(txn, &job->common);
    qemu_coroutine_enter(job->common.co);
    return;

 error:
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
    if (job) {
        blk_unref(job->target);
        block_job_unref(&job->common);
    }
}