/*
 * Live block commit
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Jeff Cody <jcody@redhat.com>
 *  Based on stream.c by Stefan Hajnoczi
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob_int.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "sysemu/block-backend.h"

enum {
    /*
     * Size of data buffer for populating the image file. This should be large
     * enough to process multiple clusters in a single call, so that populating
     * contiguous regions of the image is efficient.
     */
    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
};

#define SLICE_TIME 100000000ULL /* ns */

typedef struct CommitBlockJob {
    BlockJob common;
    RateLimit limit;
    BlockDriverState *active;
    BlockDriverState *commit_top_bs;
    BlockBackend *top;
    BlockBackend *base;
    BlockdevOnError on_error;
    int base_flags;
    int orig_overlay_flags;
    char *backing_file_str;
} CommitBlockJob;

static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
                                        int64_t offset, uint64_t bytes,
                                        void *buf)
{
    int ret = 0;
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = bytes,
    };

    assert(bytes < SIZE_MAX);
    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = blk_co_preadv(bs, offset, qiov.size, &qiov, 0);
    if (ret < 0) {
        return ret;
    }

    ret = blk_co_pwritev(base, offset, qiov.size, &qiov, 0);
    if (ret < 0) {
        return ret;
    }

    return 0;
}

typedef struct {
    int ret;
} CommitCompleteData;

static void commit_complete(BlockJob *job, void *opaque)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
    BlockDriverState *top = blk_bs(s->top);
    BlockDriverState *base = blk_bs(s->base);
    BlockDriverState *overlay_bs = bdrv_find_overlay(active, s->commit_top_bs);
    int ret = data->ret;
    bool remove_commit_top_bs = false;

    /* Make sure overlay_bs and top stay around until bdrv_set_backing_hd() */
    bdrv_ref(top);
    bdrv_ref(overlay_bs);

    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
     * the normal backing chain can be restored. */
    blk_unref(s->base);

    if (!block_job_is_cancelled(&s->common) && ret == 0) {
        /* success */
        ret = bdrv_drop_intermediate(active, s->commit_top_bs, base,
                                     s->backing_file_str);
    } else if (overlay_bs) {
        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
         * after the failed/cancelled commit job is gone? If we already wrote
         * something to base, the intermediate images aren't valid any more. */
        remove_commit_top_bs = true;
    }
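
    /*
     * At this point either bdrv_drop_intermediate() has rebased overlay_bs
     * directly on top of base (success), or the commit_top_bs filter is still
     * in the chain and will be detached again at the end of this function
     * (failure or cancellation). Everything below is cleanup of the state set
     * up by commit_start().
     */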

    /* restore base open flags here if appropriate (e.g., change the base back
     * to r/o). These reopens do not need to be atomic, since we won't abort
     * even on failure here */
    if (s->base_flags != bdrv_get_flags(base)) {
        bdrv_reopen(base, s->base_flags, NULL);
    }
    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
    }
    g_free(s->backing_file_str);
    blk_unref(s->top);

    /* If there is more than one reference to the job (e.g. if called from
     * block_job_finish_sync()), block_job_completed() won't free it and
     * therefore the blockers on the intermediate nodes remain. This would
     * cause bdrv_set_backing_hd() to fail. */
    block_job_remove_all_bdrv(job);

    block_job_completed(&s->common, ret);
    g_free(data);

    /* If bdrv_drop_intermediate() didn't already do that, remove the commit
     * filter driver from the backing chain. Do this as the final step so that
     * the 'consistent read' permission can be granted. */
    if (remove_commit_top_bs) {
        bdrv_set_backing_hd(overlay_bs, top, &error_abort);
    }

    bdrv_unref(overlay_bs);
    bdrv_unref(top);
}
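
/*
 * Main coroutine of the commit job: grow the base image if it is shorter than
 * the top image, then walk the top image in COMMIT_BUFFER_SIZE chunks and
 * copy every chunk that is allocated somewhere above the base into the base.
 * Completion (graph changes, flag restore) is deferred to the main loop via
 * commit_complete().
 */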

static void coroutine_fn commit_run(void *opaque)
{
    CommitBlockJob *s = opaque;
    CommitCompleteData *data;
    int64_t offset;
    uint64_t delay_ns = 0;
    int ret = 0;
    int64_t n = 0; /* bytes */
    void *buf = NULL;
    int bytes_written = 0;
    int64_t base_len;

    ret = s->common.len = blk_getlength(s->top);

    if (s->common.len < 0) {
        goto out;
    }

    ret = base_len = blk_getlength(s->base);
    if (base_len < 0) {
        goto out;
    }

    if (base_len < s->common.len) {
        ret = blk_truncate(s->base, s->common.len, NULL);
        if (ret) {
            goto out;
        }
    }

    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (offset = 0; offset < s->common.len; offset += n) {
        bool copy;

        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
        if (block_job_is_cancelled(&s->common)) {
            break;
        }
        /* Copy if allocated above the base */
        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
                                      offset, COMMIT_BUFFER_SIZE, &n);
        copy = (ret == 1);
        trace_commit_one_iteration(s, offset, n, ret);
        if (copy) {
            ret = commit_populate(s->top, s->base, offset, n, buf);
            bytes_written += n;
        }
        if (ret < 0) {
            BlockErrorAction action =
                block_job_error_action(&s->common, false, s->on_error, -ret);
            if (action == BLOCK_ERROR_ACTION_REPORT) {
                goto out;
            } else {
                n = 0;
                continue;
            }
        }
        /* Publish progress */
        s->common.offset += n;

        if (copy && s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, n);
        }
    }

    ret = 0;

out:
    qemu_vfree(buf);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&s->common, commit_complete, data);
}

static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
}

static const BlockJobDriver commit_job_driver = {
    .instance_size = sizeof(CommitBlockJob),
    .job_type      = BLOCK_JOB_TYPE_COMMIT,
    .set_speed     = commit_set_speed,
    .start         = commit_run,
};

static int coroutine_fn bdrv_commit_top_preadv(BlockDriverState *bs,
    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
{
    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
}

static int64_t coroutine_fn bdrv_commit_top_get_block_status(
    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
    BlockDriverState **file)
{
    *pnum = nb_sectors;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
           (sector_num << BDRV_SECTOR_BITS);
}

static void bdrv_commit_top_refresh_filename(BlockDriverState *bs, QDict *opts)
{
    bdrv_refresh_filename(bs->backing->bs);
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            bs->backing->bs->filename);
}

static void bdrv_commit_top_close(BlockDriverState *bs)
{
}

static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       const BdrvChildRole *role,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    *nperm = 0;
    *nshared = BLK_PERM_ALL;
}
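
/*
 * Sketch of the backing chain while a commit job runs ("A <- B" means A is
 * the backing file of B); node names other than commit_top are illustrative:
 *
 *   base <- intermediate <- top <- commit_top (filter) <- overlay <- ... <- active
 *
 * commit_start() inserts the commit_top filter above 'top'. On success,
 * commit_complete() drops everything from commit_top down to (but not
 * including) 'base' via bdrv_drop_intermediate(), so 'overlay' ends up backed
 * directly by 'base'; on failure it merely detaches commit_top again.
 */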

/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_commit_top = {
    .format_name                = "commit_top",
    .bdrv_co_preadv             = bdrv_commit_top_preadv,
    .bdrv_co_get_block_status   = bdrv_commit_top_get_block_status,
    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
    .bdrv_close                 = bdrv_commit_top_close,
    .bdrv_child_perm            = bdrv_commit_top_child_perm,
};

void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
                  const char *filter_node_name, Error **errp)
{
    CommitBlockJob *s;
    BlockReopenQueue *reopen_queue = NULL;
    int orig_overlay_flags;
    int orig_base_flags;
    BlockDriverState *iter;
    BlockDriverState *overlay_bs;
    BlockDriverState *commit_top_bs = NULL;
    Error *local_err = NULL;
    int ret;

    assert(top != bs);
    if (top == base) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
        return;
    }

    overlay_bs = bdrv_find_overlay(bs, top);

    if (overlay_bs == NULL) {
        error_setg(errp, "Could not find overlay image for %s", top->filename);
        return;
    }

    s = block_job_create(job_id, &commit_job_driver, bs, 0, BLK_PERM_ALL,
                         speed, BLOCK_JOB_DEFAULT, NULL, NULL, errp);
    if (!s) {
        return;
    }

    orig_base_flags    = bdrv_get_flags(base);
    orig_overlay_flags = bdrv_get_flags(overlay_bs);

    /* convert base & overlay_bs to r/w, if necessary */
    if (!(orig_base_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
                                         orig_base_flags | BDRV_O_RDWR);
    }
    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL,
                                         orig_overlay_flags | BDRV_O_RDWR);
    }
    if (reopen_queue) {
        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
            goto fail;
        }
    }

    /* Insert commit_top block node above top, so we can block consistent read
     * on the backing chain below it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
                                         errp);
    if (commit_top_bs == NULL) {
        goto fail;
    }
    commit_top_bs->total_sectors = top->total_sectors;
    bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(top));

    bdrv_set_backing_hd(commit_top_bs, top, &local_err);
    if (local_err) {
        bdrv_unref(commit_top_bs);
        commit_top_bs = NULL;
        error_propagate(errp, local_err);
        goto fail;
    }
    bdrv_set_backing_hd(overlay_bs, commit_top_bs, &local_err);
    if (local_err) {
        bdrv_unref(commit_top_bs);
        commit_top_bs = NULL;
        error_propagate(errp, local_err);
        goto fail;
    }

    s->commit_top_bs = commit_top_bs;
    bdrv_unref(commit_top_bs);
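
    /*
     * Summary of the permission setup that follows: the intermediate nodes
     * that will disappear from the chain are added to the job (as are base
     * and the overlay of top), writes to the base go through the s->base
     * BlockBackend created below, and s->top is a permission-less
     * BlockBackend that commit_run() only uses for reading and for
     * blk_getlength()/blk_blockalign().
     */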

    /* Block all nodes between top and base, because they will
     * disappear from the chain after this operation. */
    assert(bdrv_chain_contains(top, base));
    for (iter = top; iter != base; iter = backing_bs(iter)) {
        /* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
         * at s->base (if writes are blocked for a node, they are also blocked
         * for its backing file). The other options would be a second filter
         * driver above s->base. */
        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                 errp);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
    if (ret < 0) {
        goto fail;
    }

    /* overlay_bs must be blocked because it needs to be modified to
     * update the backing image string. */
    ret = block_job_add_bdrv(&s->common, "overlay of top", overlay_bs,
                             BLK_PERM_GRAPH_MOD, BLK_PERM_ALL, errp);
    if (ret < 0) {
        goto fail;
    }

    s->base = blk_new(BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_WRITE
                      | BLK_PERM_RESIZE,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_GRAPH_MOD
                      | BLK_PERM_WRITE_UNCHANGED);
    ret = blk_insert_bs(s->base, base, errp);
    if (ret < 0) {
        goto fail;
    }

    /* Required permissions are already taken with block_job_add_bdrv() */
    s->top = blk_new(0, BLK_PERM_ALL);
    ret = blk_insert_bs(s->top, top, errp);
    if (ret < 0) {
        goto fail;
    }

    s->active = bs;

    s->base_flags = orig_base_flags;
    s->orig_overlay_flags = orig_overlay_flags;

    s->backing_file_str = g_strdup(backing_file_str);

    s->on_error = on_error;

    trace_commit_start(bs, base, top, s);
    block_job_start(&s->common);
    return;

fail:
    if (s->base) {
        blk_unref(s->base);
    }
    if (s->top) {
        blk_unref(s->top);
    }
    if (commit_top_bs) {
        bdrv_set_backing_hd(overlay_bs, top, &error_abort);
    }
    block_job_early_fail(&s->common);
}


#define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockBackend *src, *backing;
    BlockDriverState *backing_file_bs = NULL;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriver *drv = bs->drv;
    int64_t offset, length, backing_length;
    int ro, open_flags;
    int64_t n;
    int ret = 0;
    uint8_t *buf = NULL;
    Error *local_err = NULL;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!bs->backing) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
        bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
        return -EBUSY;
    }

    ro = bs->backing->bs->read_only;
    open_flags = bs->backing->bs->open_flags;

    if (ro) {
        if (bdrv_reopen(bs->backing->bs, open_flags | BDRV_O_RDWR, NULL)) {
            return -EACCES;
        }
    }

    src = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
    backing = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);

    ret = blk_insert_bs(src, bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    /* Insert commit_top block node above backing, so we can write to it */
    backing_file_bs = backing_bs(bs);

    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
                                         &local_err);
    if (commit_top_bs == NULL) {
        error_report_err(local_err);
        goto ro_cleanup;
    }
    bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(backing_file_bs));

    bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
    bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);

    ret = blk_insert_bs(backing, backing_file_bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }
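
    /*
     * The graph is now set up; what follows is the synchronous copy: match
     * the backing file's length to this image, copy every allocated chunk of
     * bs into the backing file, empty bs if the driver supports it, and
     * finally flush the backing device before the chain is restored below.
     */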

    length = blk_getlength(src);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = blk_getlength(backing);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible. If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = blk_truncate(backing, length, &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            goto ro_cleanup;
        }
    }

    /* blk_try_blockalign() for src will choose an alignment that works for
     * backing as well, so no need to compare the alignment manually. */
    buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    for (offset = 0; offset < length; offset += n) {
        ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = blk_pread(src, offset, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = blk_pwrite(backing, offset, buf, n, 0);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        if (ret < 0) {
            goto ro_cleanup;
        }
        blk_flush(src);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    blk_flush(backing);

    ret = 0;
ro_cleanup:
    qemu_vfree(buf);

    blk_unref(backing);
    if (backing_file_bs) {
        bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
    }
    bdrv_unref(commit_top_bs);
    blk_unref(src);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen(bs->backing->bs, open_flags & ~BDRV_O_RDWR, NULL);
    }

    return ret;
}