/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/cutils.h"
#include "qemu/queue.h"
#include "block.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration/register.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "sysemu/block-backend.h"

#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)

#define MAX_IO_BUFFERS 512
#define MAX_PARALLEL_IO 16

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockBackend *blk;
    char *blk_name;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    Error *blocker;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Data in the aio_bitmap is protected by block migration lock.
     * Allocation and free happen during setup and cleanup respectively.
     */
    unsigned long *aio_bitmap;

    /* Protected by block migration lock.  */
    int64_t completed_sectors;

    /* During migration this is protected by iothread lock / AioContext.
     * Allocation and free happen during setup and cleanup respectively.
     */
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}
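
/* On-the-wire format of one device block, as emitted by blk_send() below and
 * parsed by block_load() on the destination (summary derived from this file):
 *
 *   8 bytes, big-endian : (sector << BDRV_SECTOR_BITS) | flags
 *   1 byte              : length of the device name
 *   n bytes             : device name (not NUL-terminated)
 *   BLOCK_SIZE bytes    : block payload, omitted when BLK_MIG_FLAG_ZERO_BLOCK
 *                         is set
 *
 * Progress records carry (percentage << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS,
 * and each stream section ends with a bare BLK_MIG_FLAG_EOS word.
 */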

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->blk_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);

    /* if a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * thus if we queue zero blocks we slow down the migration */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

int blk_mig_bulk_active(void)
{
    return blk_mig_active() && !block_mig_state.bulk_completed;
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < blk_nb_sectors(bmds->blk)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                                  int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockBackend *bb = bmds->blk;
    int64_t bitmap_size;

    bitmap_size = blk_nb_sectors(bb) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}
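
/* The aio_bitmap keeps one bit per BDRV_SECTORS_PER_DIRTY_CHUNK-sector
 * (i.e. BLOCK_SIZE = 1 MiB) chunk, set while an asynchronous read of that
 * chunk is in flight.  The sizing arithmetic above therefore yields a byte
 * count:
 *
 *   bitmap_size = ceil(nb_sectors / (BDRV_SECTORS_PER_DIRTY_CHUNK * 8))
 *
 * e.g. a 10 GiB device (20971520 sectors, 10240 chunks) needs 1280 bytes.
 */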

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockBackend *bb = bmds->blk;
    BlkMigBlock *blk;
    int nr_sectors;
    int64_t count;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        aio_context_acquire(blk_get_aio_context(bb));
        /* Skip unallocated sectors; intentionally treats failure or
         * partial sector as an allocated sector */
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
            if (count < BDRV_SECTOR_SIZE) {
                break;
            }
            cur_sector += count >> BDRV_SECTOR_BITS;
        }
        aio_context_release(blk_get_aio_context(bb));
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    /* We do not know if bs is under the main thread (and thus does
     * not acquire the AioContext when doing AIO) or rather under
     * dataplane.  Thus acquire both the iothread mutex and the
     * AioContext.
     *
     * This is ugly and will disappear when we make bdrv_* thread-safe,
     * without the need to acquire the AioContext.
     */
    qemu_mutex_lock_iothread();
    aio_context_acquire(blk_get_aio_context(bmds->blk));
    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
                            nr_sectors * BDRV_SECTOR_SIZE);
    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
                                0, blk_mig_read_cb, blk);
    aio_context_release(blk_get_aio_context(bmds->blk));
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}
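
/* Each call to mig_save_device_bulk() submits at most one BLOCK_SIZE read.
 * The dirty bitmap is cleared for the range before the read is issued, so a
 * guest write that races with the read re-dirties the chunk and is picked up
 * again by the dirty phase.  The completed BlkMigBlock is queued by
 * blk_mig_read_cb() and sent out later by flush_blks().
 */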

/* Called with iothread lock taken.  */

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
                                                      BLOCK_SIZE, NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap);
        }
    }
    return ret;
}

/* Called with iothread lock taken.  */

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(blk_bs(bmds->blk), bmds->dirty_bitmap);
    }
}

static int init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;
    BdrvNextIterator it;
    int i, num_bs = 0;
    struct {
        BlkMigDevState *bmds;
        BlockDriverState *bs;
    } *bmds_bs;
    Error *local_err = NULL;
    int ret;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        num_bs++;
    }
    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));

    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            ret = sectors;
            bdrv_next_cleanup(&it);
            goto out;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->blk = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = migrate_use_block_incremental();

        assert(i < num_bs);
        bmds_bs[i].bmds = bmds;
        bmds_bs[i].bs = bs;

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bdrv_get_device_name(bs));
        } else {
            DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }

    /* Can only insert new BDSes now because doing so while iterating block
     * devices may end up in a deadlock (iterating the new BDSes, too). */
    for (i = 0; i < num_bs; i++) {
        BlkMigDevState *bmds = bmds_bs[i].bmds;
        BlockDriverState *bs = bmds_bs[i].bs;

        if (bmds) {
            ret = blk_insert_bs(bmds->blk, bs, &local_err);
            if (ret < 0) {
                error_report_err(local_err);
                goto out;
            }

            alloc_aio_bitmap(bmds);
            error_setg(&bmds->blocker, "block device is in use by migration");
            bdrv_op_block_all(bs, bmds->blocker);
        }
    }

    ret = 0;
out:
    g_free(bmds_bs);
    return ret;
}
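
/* When block incremental migration is enabled (bmds->shared_base above),
 * mig_save_device_bulk() skips sectors that are not allocated in the local
 * image, so only data written on top of a shared backing file is bulk-copied;
 * otherwise the whole device is copied.
 */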

/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock and AioContext taken.  */

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    BlockDriverState *bs = blk_bs(bmds->blk);
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            blk_drain(bmds->blk);
        } else {
            blk_mig_unlock();
        }
        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
        if (bdrv_get_dirty_locked(bs, bmds->dirty_bitmap,
                                  sector * BDRV_SECTOR_SIZE)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
                                           sector * BDRV_SECTOR_SIZE,
                                           nr_sectors * BDRV_SECTOR_SIZE);
            bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);

            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                qemu_iovec_init_buf(&blk->qiov, blk->buf,
                                    nr_sectors * BDRV_SECTOR_SIZE);

                blk->aiocb = blk_aio_preadv(bmds->blk,
                                            sector * BDRV_SECTOR_SIZE,
                                            &blk->qiov, 0, blk_mig_read_cb,
                                            blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE, blk->buf,
                                nr_sectors * BDRV_SECTOR_SIZE);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            sector += nr_sectors;
            bmds->cur_dirty = sector;
            break;
        }

        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}
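
/* In the asynchronous path above the BlkMigBlock is not freed here: it is
 * queued onto blk_list by blk_mig_read_cb() when the read completes and is
 * sent and freed later by flush_blks().  The synchronous path (is_async == 0,
 * used by block_save_complete()) sends and frees the block immediately.
 */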

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: few enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        ret = mig_save_device_dirty(f, bmds, is_async);
        aio_context_release(blk_get_aio_context(bmds->blk));
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __func__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __func__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
        aio_context_release(blk_get_aio_context(bmds->blk));
    }

    return dirty;
}


/* Called with iothread lock taken.  */
static void block_migration_cleanup_bmds(void)
{
    BlkMigDevState *bmds;
    AioContext *ctx;

    unset_dirty_tracking();

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
        error_free(bmds->blocker);

        /* Save ctx, because bmds->blk can disappear during blk_unref.  */
        ctx = blk_get_aio_context(bmds->blk);
        aio_context_acquire(ctx);
        blk_unref(bmds->blk);
        aio_context_release(ctx);

        g_free(bmds->blk_name);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }
}
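
/* block_migration_cleanup() below is registered as the .save_cleanup hook.
 * It drains all block devices first, so freeing the queued BlkMigBlocks
 * cannot race with blk_mig_read_cb().
 */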

/* Called with iothread lock taken.  */
static void block_migration_cleanup(void *opaque)
{
    BlkMigBlock *blk;

    bdrv_drain_all();

    block_migration_cleanup_bmds();

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    ret = init_blk_migration(f);
    if (ret < 0) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    /* start track dirty blocks */
    ret = set_dirty_tracking();

    qemu_mutex_unlock_iothread();

    if (ret) {
        return ret;
    }

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);
    int64_t delta_ftell;

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while (block_mig_state.read_done * BLOCK_SIZE <
           qemu_file_get_rate_limit(f) &&
           block_mig_state.submitted < MAX_PARALLEL_IO &&
           (block_mig_state.submitted + block_mig_state.read_done) <
           MAX_IO_BUFFERS) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_ftell = qemu_ftell(f) - last_ftell;
    if (delta_ftell > 0) {
        return 1;
    } else if (delta_ftell < 0) {
        return -1;
    } else {
        return 0;
    }
}
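
/* The loop in block_save_iterate() stops queueing new reads once the buffered
 * data (read_done * BLOCK_SIZE) reaches the QEMUFile rate limit, once
 * MAX_PARALLEL_IO (16) reads are in flight, or once submitted + read_done
 * reaches MAX_IO_BUFFERS (512), which bounds buffered data to roughly 512 MiB.
 */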

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that save bulk is completed and
       all async read completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    /* Make sure that our BlockBackends are gone, so that the block driver
     * nodes can be inactivated.  */
    block_migration_cleanup_bmds();

    return 0;
}

static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                               uint64_t *res_precopy_only,
                               uint64_t *res_compatible,
                               uint64_t *res_postcopy_only)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    pending = get_remaining_dirty();
    qemu_mutex_unlock_iothread();

    blk_mig_lock();
    pending += block_mig_state.submitted * BLOCK_SIZE +
               block_mig_state.read_done * BLOCK_SIZE;
    blk_mig_unlock();

    /* Report at least one block pending during bulk phase */
    if (pending <= max_size && !block_mig_state.bulk_completed) {
        pending = max_size + BLOCK_SIZE;
    }

    DPRINTF("Enter save live pending %" PRIu64 "\n", pending);
    /* We don't do postcopy */
    *res_precopy_only += pending;
}
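
/* block_load() runs on the destination and parses the stream format described
 * above blk_send(): it resolves the device by name, invalidates its cache on
 * first use, and writes each received chunk, turning all-zero clusters into
 * blk_pwrite_zeroes() requests when possible.
 */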

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockBackend *blk, *blk_prev = NULL;
    Error *local_err = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;
    BlockDriverInfo bdi;
    int cluster_size = BLOCK_SIZE;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (blk != blk_prev) {
                blk_prev = blk;
                total_sectors = blk_nb_sectors(blk);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }

                blk_invalidate_cache(blk, &local_err);
                if (local_err) {
                    error_report_err(local_err);
                    return -EINVAL;
                }

                ret = bdrv_get_info(blk_bs(blk), &bdi);
                if (ret == 0 && bdi.cluster_size > 0 &&
                    bdi.cluster_size <= BLOCK_SIZE &&
                    BLOCK_SIZE % bdi.cluster_size == 0) {
                    cluster_size = bdi.cluster_size;
                } else {
                    cluster_size = BLOCK_SIZE;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
                                        nr_sectors * BDRV_SECTOR_SIZE,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                int i;
                int64_t cur_addr;
                uint8_t *cur_buf;

                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                for (i = 0; i < BLOCK_SIZE / cluster_size; i++) {
                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
                    cur_buf = buf + i * cluster_size;

                    if ((!block_mig_state.zero_blocks ||
                        cluster_size < BLOCK_SIZE) &&
                        buffer_is_zero(cur_buf, cluster_size)) {
                        ret = blk_pwrite_zeroes(blk, cur_addr,
                                                cluster_size,
                                                BDRV_REQ_MAY_UNMAP);
                    } else {
                        ret = blk_pwrite(blk, cur_addr, cur_buf,
                                         cluster_size, 0);
                    }
                    if (ret < 0) {
                        break;
                    }
                }
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static bool block_is_active(void *opaque)
{
    return migrate_use_block();
}

static SaveVMHandlers savevm_block_handlers = {
    .save_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .save_cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}