1 /* 2 * QEMU live migration 3 * 4 * Copyright IBM, Corp. 2008 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Contributions after 2012-01-13 are licensed under the terms of the 13 * GNU GPL, version 2 or (at your option) any later version. 14 */ 15 16 #include "qemu-common.h" 17 #include "qemu/main-loop.h" 18 #include "migration/migration.h" 19 #include "monitor/monitor.h" 20 #include "migration/qemu-file.h" 21 #include "sysemu/sysemu.h" 22 #include "block/block.h" 23 #include "qemu/sockets.h" 24 #include "migration/block.h" 25 #include "qemu/thread.h" 26 #include "qmp-commands.h" 27 #include "trace.h" 28 29 enum { 30 MIG_STATE_ERROR = -1, 31 MIG_STATE_NONE, 32 MIG_STATE_SETUP, 33 MIG_STATE_CANCELLING, 34 MIG_STATE_CANCELLED, 35 MIG_STATE_ACTIVE, 36 MIG_STATE_COMPLETED, 37 }; 38 39 #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ 40 41 /* Amount of time to allocate to each "chunk" of bandwidth-throttled 42 * data. */ 43 #define BUFFER_DELAY 100 44 #define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY) 45 46 /* Migration XBZRLE default cache size */ 47 #define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024) 48 49 static NotifierList migration_state_notifiers = 50 NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); 51 52 /* When we add fault tolerance, we could have several 53 migrations at once. For now we don't need to add 54 dynamic creation of migration */ 55 56 MigrationState *migrate_get_current(void) 57 { 58 static MigrationState current_migration = { 59 .state = MIG_STATE_NONE, 60 .bandwidth_limit = MAX_THROTTLE, 61 .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE, 62 .mbps = -1, 63 }; 64 65 return ¤t_migration; 66 } 67 68 void qemu_start_incoming_migration(const char *uri, Error **errp) 69 { 70 const char *p; 71 72 if (strstart(uri, "tcp:", &p)) 73 tcp_start_incoming_migration(p, errp); 74 #ifdef CONFIG_RDMA 75 else if (strstart(uri, "rdma:", &p)) 76 rdma_start_incoming_migration(p, errp); 77 #endif 78 #if !defined(WIN32) 79 else if (strstart(uri, "exec:", &p)) 80 exec_start_incoming_migration(p, errp); 81 else if (strstart(uri, "unix:", &p)) 82 unix_start_incoming_migration(p, errp); 83 else if (strstart(uri, "fd:", &p)) 84 fd_start_incoming_migration(p, errp); 85 #endif 86 else { 87 error_setg(errp, "unknown migration protocol: %s", uri); 88 } 89 } 90 91 static void process_incoming_migration_co(void *opaque) 92 { 93 QEMUFile *f = opaque; 94 Error *local_err = NULL; 95 int ret; 96 97 ret = qemu_loadvm_state(f); 98 qemu_fclose(f); 99 free_xbzrle_decoded_buf(); 100 if (ret < 0) { 101 error_report("load of migration failed: %s", strerror(-ret)); 102 exit(EXIT_FAILURE); 103 } 104 qemu_announce_self(); 105 106 /* Make sure all file formats flush their mutable metadata */ 107 bdrv_invalidate_cache_all(&local_err); 108 if (local_err) { 109 qerror_report_err(local_err); 110 error_free(local_err); 111 exit(EXIT_FAILURE); 112 } 113 114 if (autostart) { 115 vm_start(); 116 } else { 117 runstate_set(RUN_STATE_PAUSED); 118 } 119 } 120 121 void process_incoming_migration(QEMUFile *f) 122 { 123 Coroutine *co = qemu_coroutine_create(process_incoming_migration_co); 124 int fd = qemu_get_fd(f); 125 126 assert(fd != -1); 127 qemu_set_nonblock(fd); 128 qemu_coroutine_enter(co, f); 129 } 130 131 /* amount of nanoseconds we are willing to wait for migration to be down. 132 * the choice of nanoseconds is because it is the maximum resolution that 133 * get_clock() can achieve. It is an internal measure. All user-visible 134 * units must be in seconds */ 135 static uint64_t max_downtime = 300000000; 136 137 uint64_t migrate_max_downtime(void) 138 { 139 return max_downtime; 140 } 141 142 MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp) 143 { 144 MigrationCapabilityStatusList *head = NULL; 145 MigrationCapabilityStatusList *caps; 146 MigrationState *s = migrate_get_current(); 147 int i; 148 149 caps = NULL; /* silence compiler warning */ 150 for (i = 0; i < MIGRATION_CAPABILITY_MAX; i++) { 151 if (head == NULL) { 152 head = g_malloc0(sizeof(*caps)); 153 caps = head; 154 } else { 155 caps->next = g_malloc0(sizeof(*caps)); 156 caps = caps->next; 157 } 158 caps->value = 159 g_malloc(sizeof(*caps->value)); 160 caps->value->capability = i; 161 caps->value->state = s->enabled_capabilities[i]; 162 } 163 164 return head; 165 } 166 167 static void get_xbzrle_cache_stats(MigrationInfo *info) 168 { 169 if (migrate_use_xbzrle()) { 170 info->has_xbzrle_cache = true; 171 info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); 172 info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size(); 173 info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred(); 174 info->xbzrle_cache->pages = xbzrle_mig_pages_transferred(); 175 info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss(); 176 info->xbzrle_cache->cache_miss_rate = xbzrle_mig_cache_miss_rate(); 177 info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow(); 178 } 179 } 180 181 MigrationInfo *qmp_query_migrate(Error **errp) 182 { 183 MigrationInfo *info = g_malloc0(sizeof(*info)); 184 MigrationState *s = migrate_get_current(); 185 186 switch (s->state) { 187 case MIG_STATE_NONE: 188 /* no migration has happened ever */ 189 break; 190 case MIG_STATE_SETUP: 191 info->has_status = true; 192 info->status = g_strdup("setup"); 193 info->has_total_time = false; 194 break; 195 case MIG_STATE_ACTIVE: 196 case MIG_STATE_CANCELLING: 197 info->has_status = true; 198 info->status = g_strdup("active"); 199 info->has_total_time = true; 200 info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) 201 - s->total_time; 202 info->has_expected_downtime = true; 203 info->expected_downtime = s->expected_downtime; 204 info->has_setup_time = true; 205 info->setup_time = s->setup_time; 206 207 info->has_ram = true; 208 info->ram = g_malloc0(sizeof(*info->ram)); 209 info->ram->transferred = ram_bytes_transferred(); 210 info->ram->remaining = ram_bytes_remaining(); 211 info->ram->total = ram_bytes_total(); 212 info->ram->duplicate = dup_mig_pages_transferred(); 213 info->ram->skipped = skipped_mig_pages_transferred(); 214 info->ram->normal = norm_mig_pages_transferred(); 215 info->ram->normal_bytes = norm_mig_bytes_transferred(); 216 info->ram->dirty_pages_rate = s->dirty_pages_rate; 217 info->ram->mbps = s->mbps; 218 info->ram->dirty_sync_count = s->dirty_sync_count; 219 220 if (blk_mig_active()) { 221 info->has_disk = true; 222 info->disk = g_malloc0(sizeof(*info->disk)); 223 info->disk->transferred = blk_mig_bytes_transferred(); 224 info->disk->remaining = blk_mig_bytes_remaining(); 225 info->disk->total = blk_mig_bytes_total(); 226 } 227 228 get_xbzrle_cache_stats(info); 229 break; 230 case MIG_STATE_COMPLETED: 231 get_xbzrle_cache_stats(info); 232 233 info->has_status = true; 234 info->status = g_strdup("completed"); 235 info->has_total_time = true; 236 info->total_time = s->total_time; 237 info->has_downtime = true; 238 info->downtime = s->downtime; 239 info->has_setup_time = true; 240 info->setup_time = s->setup_time; 241 242 info->has_ram = true; 243 info->ram = g_malloc0(sizeof(*info->ram)); 244 info->ram->transferred = ram_bytes_transferred(); 245 info->ram->remaining = 0; 246 info->ram->total = ram_bytes_total(); 247 info->ram->duplicate = dup_mig_pages_transferred(); 248 info->ram->skipped = skipped_mig_pages_transferred(); 249 info->ram->normal = norm_mig_pages_transferred(); 250 info->ram->normal_bytes = norm_mig_bytes_transferred(); 251 info->ram->mbps = s->mbps; 252 info->ram->dirty_sync_count = s->dirty_sync_count; 253 break; 254 case MIG_STATE_ERROR: 255 info->has_status = true; 256 info->status = g_strdup("failed"); 257 break; 258 case MIG_STATE_CANCELLED: 259 info->has_status = true; 260 info->status = g_strdup("cancelled"); 261 break; 262 } 263 264 return info; 265 } 266 267 void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, 268 Error **errp) 269 { 270 MigrationState *s = migrate_get_current(); 271 MigrationCapabilityStatusList *cap; 272 273 if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) { 274 error_set(errp, QERR_MIGRATION_ACTIVE); 275 return; 276 } 277 278 for (cap = params; cap; cap = cap->next) { 279 s->enabled_capabilities[cap->value->capability] = cap->value->state; 280 } 281 } 282 283 /* shared migration helpers */ 284 285 static void migrate_set_state(MigrationState *s, int old_state, int new_state) 286 { 287 if (atomic_cmpxchg(&s->state, old_state, new_state) == new_state) { 288 trace_migrate_set_state(new_state); 289 } 290 } 291 292 static void migrate_fd_cleanup(void *opaque) 293 { 294 MigrationState *s = opaque; 295 296 qemu_bh_delete(s->cleanup_bh); 297 s->cleanup_bh = NULL; 298 299 if (s->file) { 300 trace_migrate_fd_cleanup(); 301 qemu_mutex_unlock_iothread(); 302 qemu_thread_join(&s->thread); 303 qemu_mutex_lock_iothread(); 304 305 qemu_fclose(s->file); 306 s->file = NULL; 307 } 308 309 assert(s->state != MIG_STATE_ACTIVE); 310 311 if (s->state != MIG_STATE_COMPLETED) { 312 qemu_savevm_state_cancel(); 313 if (s->state == MIG_STATE_CANCELLING) { 314 migrate_set_state(s, MIG_STATE_CANCELLING, MIG_STATE_CANCELLED); 315 } 316 } 317 318 notifier_list_notify(&migration_state_notifiers, s); 319 } 320 321 void migrate_fd_error(MigrationState *s) 322 { 323 trace_migrate_fd_error(); 324 assert(s->file == NULL); 325 s->state = MIG_STATE_ERROR; 326 trace_migrate_set_state(MIG_STATE_ERROR); 327 notifier_list_notify(&migration_state_notifiers, s); 328 } 329 330 static void migrate_fd_cancel(MigrationState *s) 331 { 332 int old_state ; 333 trace_migrate_fd_cancel(); 334 335 do { 336 old_state = s->state; 337 if (old_state != MIG_STATE_SETUP && old_state != MIG_STATE_ACTIVE) { 338 break; 339 } 340 migrate_set_state(s, old_state, MIG_STATE_CANCELLING); 341 } while (s->state != MIG_STATE_CANCELLING); 342 } 343 344 void add_migration_state_change_notifier(Notifier *notify) 345 { 346 notifier_list_add(&migration_state_notifiers, notify); 347 } 348 349 void remove_migration_state_change_notifier(Notifier *notify) 350 { 351 notifier_remove(notify); 352 } 353 354 bool migration_in_setup(MigrationState *s) 355 { 356 return s->state == MIG_STATE_SETUP; 357 } 358 359 bool migration_has_finished(MigrationState *s) 360 { 361 return s->state == MIG_STATE_COMPLETED; 362 } 363 364 bool migration_has_failed(MigrationState *s) 365 { 366 return (s->state == MIG_STATE_CANCELLED || 367 s->state == MIG_STATE_ERROR); 368 } 369 370 static MigrationState *migrate_init(const MigrationParams *params) 371 { 372 MigrationState *s = migrate_get_current(); 373 int64_t bandwidth_limit = s->bandwidth_limit; 374 bool enabled_capabilities[MIGRATION_CAPABILITY_MAX]; 375 int64_t xbzrle_cache_size = s->xbzrle_cache_size; 376 377 memcpy(enabled_capabilities, s->enabled_capabilities, 378 sizeof(enabled_capabilities)); 379 380 memset(s, 0, sizeof(*s)); 381 s->params = *params; 382 memcpy(s->enabled_capabilities, enabled_capabilities, 383 sizeof(enabled_capabilities)); 384 s->xbzrle_cache_size = xbzrle_cache_size; 385 386 s->bandwidth_limit = bandwidth_limit; 387 s->state = MIG_STATE_SETUP; 388 trace_migrate_set_state(MIG_STATE_SETUP); 389 390 s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 391 return s; 392 } 393 394 static GSList *migration_blockers; 395 396 void migrate_add_blocker(Error *reason) 397 { 398 migration_blockers = g_slist_prepend(migration_blockers, reason); 399 } 400 401 void migrate_del_blocker(Error *reason) 402 { 403 migration_blockers = g_slist_remove(migration_blockers, reason); 404 } 405 406 void qmp_migrate(const char *uri, bool has_blk, bool blk, 407 bool has_inc, bool inc, bool has_detach, bool detach, 408 Error **errp) 409 { 410 Error *local_err = NULL; 411 MigrationState *s = migrate_get_current(); 412 MigrationParams params; 413 const char *p; 414 415 params.blk = has_blk && blk; 416 params.shared = has_inc && inc; 417 418 if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP || 419 s->state == MIG_STATE_CANCELLING) { 420 error_set(errp, QERR_MIGRATION_ACTIVE); 421 return; 422 } 423 424 if (runstate_check(RUN_STATE_INMIGRATE)) { 425 error_setg(errp, "Guest is waiting for an incoming migration"); 426 return; 427 } 428 429 if (qemu_savevm_state_blocked(errp)) { 430 return; 431 } 432 433 if (migration_blockers) { 434 *errp = error_copy(migration_blockers->data); 435 return; 436 } 437 438 s = migrate_init(¶ms); 439 440 if (strstart(uri, "tcp:", &p)) { 441 tcp_start_outgoing_migration(s, p, &local_err); 442 #ifdef CONFIG_RDMA 443 } else if (strstart(uri, "rdma:", &p)) { 444 rdma_start_outgoing_migration(s, p, &local_err); 445 #endif 446 #if !defined(WIN32) 447 } else if (strstart(uri, "exec:", &p)) { 448 exec_start_outgoing_migration(s, p, &local_err); 449 } else if (strstart(uri, "unix:", &p)) { 450 unix_start_outgoing_migration(s, p, &local_err); 451 } else if (strstart(uri, "fd:", &p)) { 452 fd_start_outgoing_migration(s, p, &local_err); 453 #endif 454 } else { 455 error_set(errp, QERR_INVALID_PARAMETER_VALUE, "uri", "a valid migration protocol"); 456 s->state = MIG_STATE_ERROR; 457 return; 458 } 459 460 if (local_err) { 461 migrate_fd_error(s); 462 error_propagate(errp, local_err); 463 return; 464 } 465 } 466 467 void qmp_migrate_cancel(Error **errp) 468 { 469 migrate_fd_cancel(migrate_get_current()); 470 } 471 472 void qmp_migrate_set_cache_size(int64_t value, Error **errp) 473 { 474 MigrationState *s = migrate_get_current(); 475 int64_t new_size; 476 477 /* Check for truncation */ 478 if (value != (size_t)value) { 479 error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 480 "exceeding address space"); 481 return; 482 } 483 484 /* Cache should not be larger than guest ram size */ 485 if (value > ram_bytes_total()) { 486 error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 487 "exceeds guest ram size "); 488 return; 489 } 490 491 new_size = xbzrle_cache_resize(value); 492 if (new_size < 0) { 493 error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 494 "is smaller than page size"); 495 return; 496 } 497 498 s->xbzrle_cache_size = new_size; 499 } 500 501 int64_t qmp_query_migrate_cache_size(Error **errp) 502 { 503 return migrate_xbzrle_cache_size(); 504 } 505 506 void qmp_migrate_set_speed(int64_t value, Error **errp) 507 { 508 MigrationState *s; 509 510 if (value < 0) { 511 value = 0; 512 } 513 if (value > SIZE_MAX) { 514 value = SIZE_MAX; 515 } 516 517 s = migrate_get_current(); 518 s->bandwidth_limit = value; 519 if (s->file) { 520 qemu_file_set_rate_limit(s->file, s->bandwidth_limit / XFER_LIMIT_RATIO); 521 } 522 } 523 524 void qmp_migrate_set_downtime(double value, Error **errp) 525 { 526 value *= 1e9; 527 value = MAX(0, MIN(UINT64_MAX, value)); 528 max_downtime = (uint64_t)value; 529 } 530 531 bool migrate_rdma_pin_all(void) 532 { 533 MigrationState *s; 534 535 s = migrate_get_current(); 536 537 return s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]; 538 } 539 540 bool migrate_auto_converge(void) 541 { 542 MigrationState *s; 543 544 s = migrate_get_current(); 545 546 return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; 547 } 548 549 bool migrate_zero_blocks(void) 550 { 551 MigrationState *s; 552 553 s = migrate_get_current(); 554 555 return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; 556 } 557 558 int migrate_use_xbzrle(void) 559 { 560 MigrationState *s; 561 562 s = migrate_get_current(); 563 564 return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; 565 } 566 567 int64_t migrate_xbzrle_cache_size(void) 568 { 569 MigrationState *s; 570 571 s = migrate_get_current(); 572 573 return s->xbzrle_cache_size; 574 } 575 576 /* migration thread support */ 577 578 static void *migration_thread(void *opaque) 579 { 580 MigrationState *s = opaque; 581 int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 582 int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); 583 int64_t initial_bytes = 0; 584 int64_t max_size = 0; 585 int64_t start_time = initial_time; 586 bool old_vm_running = false; 587 588 qemu_savevm_state_begin(s->file, &s->params); 589 590 s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; 591 migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE); 592 593 while (s->state == MIG_STATE_ACTIVE) { 594 int64_t current_time; 595 uint64_t pending_size; 596 597 if (!qemu_file_rate_limit(s->file)) { 598 pending_size = qemu_savevm_state_pending(s->file, max_size); 599 trace_migrate_pending(pending_size, max_size); 600 if (pending_size && pending_size >= max_size) { 601 qemu_savevm_state_iterate(s->file); 602 } else { 603 int ret; 604 605 qemu_mutex_lock_iothread(); 606 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 607 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); 608 old_vm_running = runstate_is_running(); 609 610 ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); 611 if (ret >= 0) { 612 qemu_file_set_rate_limit(s->file, INT64_MAX); 613 qemu_savevm_state_complete(s->file); 614 } 615 qemu_mutex_unlock_iothread(); 616 617 if (ret < 0) { 618 migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR); 619 break; 620 } 621 622 if (!qemu_file_get_error(s->file)) { 623 migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED); 624 break; 625 } 626 } 627 } 628 629 if (qemu_file_get_error(s->file)) { 630 migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR); 631 break; 632 } 633 current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 634 if (current_time >= initial_time + BUFFER_DELAY) { 635 uint64_t transferred_bytes = qemu_ftell(s->file) - initial_bytes; 636 uint64_t time_spent = current_time - initial_time; 637 double bandwidth = transferred_bytes / time_spent; 638 max_size = bandwidth * migrate_max_downtime() / 1000000; 639 640 s->mbps = time_spent ? (((double) transferred_bytes * 8.0) / 641 ((double) time_spent / 1000.0)) / 1000.0 / 1000.0 : -1; 642 643 trace_migrate_transferred(transferred_bytes, time_spent, 644 bandwidth, max_size); 645 /* if we haven't sent anything, we don't want to recalculate 646 10000 is a small enough number for our purposes */ 647 if (s->dirty_bytes_rate && transferred_bytes > 10000) { 648 s->expected_downtime = s->dirty_bytes_rate / bandwidth; 649 } 650 651 qemu_file_reset_rate_limit(s->file); 652 initial_time = current_time; 653 initial_bytes = qemu_ftell(s->file); 654 } 655 if (qemu_file_rate_limit(s->file)) { 656 /* usleep expects microseconds */ 657 g_usleep((initial_time + BUFFER_DELAY - current_time)*1000); 658 } 659 } 660 661 qemu_mutex_lock_iothread(); 662 if (s->state == MIG_STATE_COMPLETED) { 663 int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 664 uint64_t transferred_bytes = qemu_ftell(s->file); 665 s->total_time = end_time - s->total_time; 666 s->downtime = end_time - start_time; 667 if (s->total_time) { 668 s->mbps = (((double) transferred_bytes * 8.0) / 669 ((double) s->total_time)) / 1000; 670 } 671 runstate_set(RUN_STATE_POSTMIGRATE); 672 } else { 673 if (old_vm_running) { 674 vm_start(); 675 } 676 } 677 qemu_bh_schedule(s->cleanup_bh); 678 qemu_mutex_unlock_iothread(); 679 680 return NULL; 681 } 682 683 void migrate_fd_connect(MigrationState *s) 684 { 685 s->state = MIG_STATE_SETUP; 686 trace_migrate_set_state(MIG_STATE_SETUP); 687 688 /* This is a best 1st approximation. ns to ms */ 689 s->expected_downtime = max_downtime/1000000; 690 s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); 691 692 qemu_file_set_rate_limit(s->file, 693 s->bandwidth_limit / XFER_LIMIT_RATIO); 694 695 /* Notify before starting migration thread */ 696 notifier_list_notify(&migration_state_notifiers, s); 697 698 qemu_thread_create(&s->thread, "migration", migration_thread, s, 699 QEMU_THREAD_JOINABLE); 700 } 701