/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"

#define BACKUP_CLUSTER_BITS 16
#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)

#define SLICE_TIME 100000000ULL /* ns */

typedef struct CowRequest {
    int64_t start;
    int64_t end;
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;

typedef struct BackupBlockJob {
    BlockJob common;
    BlockDriverState *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
    HBitmap *bitmap;
    QLIST_HEAD(, CowRequest) inflight_reqs;
} BackupBlockJob;

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
{
    req->start = start;
    req->end = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read,
                                      bool is_write_notifier)
{
    BackupBlockJob *job = (BackupBlockJob *)bs->job;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
        if (hbitmap_get(job->bitmap, start)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
                job->common.len / BDRV_SECTOR_SIZE -
                start * BACKUP_SECTORS_PER_CLUSTER);

        if (!bounce_buffer) {
            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
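        /* Present the bounce buffer as a single-element QEMUIOVector so the
         * vectored read/write helpers below can consume it. */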
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        if (is_write_notifier) {
            ret = bdrv_co_no_copy_on_readv(bs,
                                           start * BACKUP_SECTORS_PER_CLUSTER,
                                           n, &bounce_qiov);
        } else {
            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
                                &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            goto out;
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
                                       start * BACKUP_SECTORS_PER_CLUSTER,
                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = bdrv_co_writev(job->target,
                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
                                 &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            goto out;
        }

        hbitmap_set(job->bitmap, start, 1);

        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
        job->sectors_read += n;
        job->common.offset += n * BDRV_SECTOR_SIZE;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
}

static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void backup_iostatus_reset(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    bdrv_iostatus_reset(s->target);
}

static const BlockJobDriver backup_job_driver = {
    .instance_size  = sizeof(BackupBlockJob),
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
};

static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->common.bs,
                                      job->on_source_error, true, error);
    } else {
        return block_job_error_action(&job->common, job->target,
                                      job->on_target_error, false, error);
    }
}

typedef struct {
    int ret;
} BackupCompleteData;

static void backup_complete(BlockJob *job, void *opaque)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

    bdrv_unref(s->target);

    block_job_completed(job, data->ret);
    g_free(data);
}

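/* Sleep according to the configured rate limit (or simply yield) and report
 * whether the job has been cancelled in the meantime. */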
static bool coroutine_fn yield_and_check(BackupBlockJob *job)
{
    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    /* We need to yield so that bdrv_drain_all() returns.
     * (Without the yield, the VM does not reboot, for example.)
     */
    if (job->common.speed) {
        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
                                                      job->sectors_read);
        job->sectors_read = 0;
        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
    } else {
        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
    }

    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    return false;
}

static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
{
    bool error_is_read;
    int ret = 0;
    int clusters_per_iter;
    uint32_t granularity;
    int64_t sector;
    int64_t cluster;
    int64_t end;
    int64_t last_cluster = -1;
    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
    clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1);
    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);

    /* Find the next dirty sector(s) */
    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
        cluster = sector / BACKUP_SECTORS_PER_CLUSTER;

        /* Fake progress updates for any clusters we skipped */
        if (cluster != last_cluster + 1) {
            job->common.offset += ((cluster - last_cluster - 1) *
                                   BACKUP_CLUSTER_SIZE);
        }

        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
            do {
                if (yield_and_check(job)) {
                    return ret;
                }
                ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
                                    false);
                if ((ret < 0) &&
                    backup_error_action(job, error_is_read, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    return ret;
                }
            } while (ret < 0);
        }

        /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
        if (granularity < BACKUP_CLUSTER_SIZE) {
            bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER);
        }

        last_cluster = cluster - 1;
    }

    /* Play some final catchup with the progress meter */
    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
    if (last_cluster + 1 < end) {
        job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE);
    }

    return ret;
}

static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
    NotifierWithReturn before_write = {
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);

    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
    bdrv_set_on_error(target, on_target_error, on_target_error);
    bdrv_iostatus_enable(target);

    bdrv_add_before_write_notifier(bs, &before_write);

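    /* sync=none relies entirely on the before_write notifier installed above;
     * sync=incremental walks the dirty bitmap; sync=top and sync=full scan
     * every cluster of the device. */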
    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            job->common.busy = false;
            qemu_coroutine_yield();
            job->common.busy = true;
        }
    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        ret = backup_run_incremental(job);
    } else {
        /* Both FULL and TOP sync modes require copying. */
        for (; start < end; start++) {
            bool error_is_read;
            if (yield_and_check(job)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
                                          start * BACKUP_SECTORS_PER_CLUSTER + i,
                                          BACKUP_SECTORS_PER_CLUSTER - i, &n);
                    i += n;

                    if (alloced == 1 || n == 0) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* In FULL sync mode we copy the whole drive. */
            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
                                BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
                                false);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BLOCK_ERROR_ACTION_REPORT) {
                    break;
                } else {
                    start--;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);

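    /* For sync=incremental: on failure or cancellation, merge the successor
     * bitmap back into the frozen one so no dirty bits are lost; on success,
     * the original bitmap is deleted and its successor (which tracked writes
     * made during the backup) takes its place. */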
    if (job->sync_bitmap) {
        BdrvDirtyBitmap *bm;
        if (ret < 0 || block_job_is_cancelled(&job->common)) {
            /* Merge the successor back into the parent, delete nothing. */
            bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
            assert(bm);
        } else {
            /* Everything is fine, delete this bitmap and install the backup. */
            bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
            assert(bm);
        }
    }
    hbitmap_free(job->bitmap);

    bdrv_iostatus_disable(target);
    bdrv_op_unblock_all(target, job->common.blocker);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&job->common, backup_complete, data);
}

void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BdrvDirtyBitmap *sync_bitmap,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockCompletionFunc *cb, void *opaque,
                  Error **errp)
{
    int64_t len;

    assert(bs);
    assert(target);
    assert(cb);

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
        return;
    }

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
        return;
    }

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));
        return;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
        return;
    }

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
        return;
    }

    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        if (!sync_bitmap) {
            error_setg(errp, "must provide a valid bitmap name for "
                             "\"incremental\" sync mode");
            return;
        }

        /* Create a new bitmap, and freeze/disable this one. */
        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
            return;
        }
    } else if (sync_bitmap) {
        error_setg(errp,
                   "a sync_bitmap was provided to backup_run, "
                   "but received an incompatible sync_mode (%s)",
                   MirrorSyncMode_lookup[sync_mode]);
        return;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        goto error;
    }

    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
                                           cb, opaque, errp);
    if (!job) {
        goto error;
    }

    bdrv_op_block_all(target, job->common.blocker);

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    qemu_coroutine_enter(job->common.co, job);
    return;

 error:
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
}
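
/*
 * Usage sketch (editor's note, not part of the original file): a caller such
 * as qmp_drive_backup() opens or creates the target image and then starts the
 * job roughly like this:
 *
 *     backup_start(bs, target_bs, 0, MIRROR_SYNC_MODE_FULL, NULL,
 *                  BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
 *                  block_job_cb, bs, &local_err);
 *
 * target_bs and local_err are the caller's variables (hypothetical names
 * here) and block_job_cb is the caller's completion callback.
 */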