/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "migration/migration.h"
#include "migration/qemu-file.h"
#include "sysemu/sysemu.h"
#include "block/block.h"
#include "qapi/qmp/qerror.h"
#include "qapi/util.h"
#include "qemu/sockets.h"
#include "qemu/rcu.h"
#include "migration/block.h"
#include "migration/postcopy-ram.h"
#include "qemu/thread.h"
#include "qmp-commands.h"
#include "trace.h"
#include "qapi-event.h"
#include "qom/cpu.h"
#include "exec/memory.h"
#include "exec/address-spaces.h"
#include "io/channel-buffer.h"
#include "io/channel-tls.h"

#define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */

/* Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data. */
#define BUFFER_DELAY     100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
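
/*
 * Illustrative arithmetic (numbers are not from the original source):
 * with BUFFER_DELAY at 100 ms there are XFER_LIMIT_RATIO = 10 chunks
 * per second, so a bandwidth limit of MAX_THROTTLE (32 MiB/s) becomes
 * a per-chunk budget of 32 MiB / 10 ~= 3.2 MiB every 100 ms.  The
 * qemu_file_set_rate_limit() calls below divide the configured limit
 * by XFER_LIMIT_RATIO for exactly this reason.
 */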

/* Default compression thread count */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/* Default decompression thread count, usually decompression is at
 * least 4 times as fast as compression. */
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* 0 means no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
/* Define default autoconverge cpu throttle migration parameters */
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10

/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)

static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

static bool deferred_incoming;

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

/* When we add fault tolerance, we could have several
   migrations at once.  For now we don't need to add
   dynamic creation of migration */

/* For outgoing */
MigrationState *migrate_get_current(void)
{
    static bool once;
    static MigrationState current_migration = {
        .state = MIGRATION_STATUS_NONE,
        .bandwidth_limit = MAX_THROTTLE,
        .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
        .mbps = -1,
        .parameters = {
            .compress_level = DEFAULT_MIGRATE_COMPRESS_LEVEL,
            .compress_threads = DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT,
            .decompress_threads = DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT,
            .cpu_throttle_initial = DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL,
            .cpu_throttle_increment = DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT,
        },
    };

    if (!once) {
        qemu_mutex_init(&current_migration.src_page_req_mutex);
        once = true;
    }
    return &current_migration;
}

/* For incoming */
static MigrationIncomingState *mis_current;

MigrationIncomingState *migration_incoming_get_current(void)
{
    return mis_current;
}

MigrationIncomingState *migration_incoming_state_new(QEMUFile* f)
{
    mis_current = g_new0(MigrationIncomingState, 1);
    mis_current->from_src_file = f;
    mis_current->state = MIGRATION_STATUS_NONE;
    QLIST_INIT(&mis_current->loadvm_handlers);
    qemu_mutex_init(&mis_current->rp_mutex);
    qemu_event_init(&mis_current->main_thread_load_event, false);

    return mis_current;
}

void migration_incoming_state_destroy(void)
{
    qemu_event_destroy(&mis_current->main_thread_load_event);
    loadvm_free_handlers(mis_current);
    g_free(mis_current);
    mis_current = NULL;
}
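
/*
 * Sketch of the "globalstate" section layout, inferred from the
 * vmstate description below (field sizes come from this file; the
 * example runstate string is hypothetical):
 *
 *   uint32_t size;          -- strlen(runstate) + 1, set in pre_save
 *   uint8_t  runstate[100]; -- NUL-terminated name, e.g. "running"
 *
 * On load, post_load parses the string back into a RunState with
 * qapi_enum_parse().
 */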

typedef struct {
    bool optional;
    uint32_t size;
    uint8_t runstate[100];
    RunState state;
    bool received;
} GlobalState;

static GlobalState global_state;

int global_state_store(void)
{
    if (!runstate_store((char *)global_state.runstate,
                        sizeof(global_state.runstate))) {
        error_report("runstate name too big: %s", global_state.runstate);
        trace_migrate_state_too_big();
        return -EINVAL;
    }
    return 0;
}

void global_state_store_running(void)
{
    const char *state = RunState_lookup[RUN_STATE_RUNNING];
    strncpy((char *)global_state.runstate,
            state, sizeof(global_state.runstate));
}

static bool global_state_received(void)
{
    return global_state.received;
}

static RunState global_state_get_runstate(void)
{
    return global_state.state;
}

void global_state_set_optional(void)
{
    global_state.optional = true;
}

static bool global_state_needed(void *opaque)
{
    GlobalState *s = opaque;
    char *runstate = (char *)s->runstate;

    /* If it is not optional, it is mandatory */

    if (s->optional == false) {
        return true;
    }

    /* If state is running or paused, it is not needed */

    if (strcmp(runstate, "running") == 0 ||
        strcmp(runstate, "paused") == 0) {
        return false;
    }

    /* for any other state it is needed */
    return true;
}

static int global_state_post_load(void *opaque, int version_id)
{
    GlobalState *s = opaque;
    Error *local_err = NULL;
    int r;
    char *runstate = (char *)s->runstate;

    s->received = true;
    trace_migrate_global_state_post_load(runstate);

    r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE__MAX,
                        -1, &local_err);

    if (r == -1) {
        if (local_err) {
            error_report_err(local_err);
        }
        return -EINVAL;
    }
    s->state = r;

    return 0;
}

static void global_state_pre_save(void *opaque)
{
    GlobalState *s = opaque;

    trace_migrate_global_state_pre_save((char *)s->runstate);
    s->size = strlen((char *)s->runstate) + 1;
}

static const VMStateDescription vmstate_globalstate = {
    .name = "globalstate",
    .version_id = 1,
    .minimum_version_id = 1,
    .post_load = global_state_post_load,
    .pre_save = global_state_pre_save,
    .needed = global_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(size, GlobalState),
        VMSTATE_BUFFER(runstate, GlobalState),
        VMSTATE_END_OF_LIST()
    },
};

void register_global_state(void)
{
    /* We will use it regardless of whether we receive it */
    strcpy((char *)&global_state.runstate, "");
    global_state.received = false;
    vmstate_register(NULL, 0, &vmstate_globalstate, &global_state);
}

static void migrate_generate_event(int new_state)
{
    if (migrate_use_events()) {
        qapi_event_send_migration(new_state, &error_abort);
    }
}

/*
 * Called on -incoming with a defer: uri.
 * The migration can be started later after any parameters have been
 * changed.
 */
static void deferred_incoming_migration(Error **errp)
{
    if (deferred_incoming) {
        error_setg(errp, "Incoming migration already deferred");
    }
    deferred_incoming = true;
}

/* Request a range of pages from the source VM at the given
 * start address.
 *   rbname: Name of the RAMBlock to request the page in, if NULL it's the same
 *           as the last request (a name must have been given previously)
 *   start: Address offset within the RB
 *   len: Length in bytes required - must be a multiple of pagesize
 */
void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
                               ram_addr_t start, size_t len)
{
    uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname len (1),
                                 * rbname up to 255 bytes */
    size_t msglen = 12; /* start + len */

    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);

    if (rbname) {
        int rbname_len = strlen(rbname);
        assert(rbname_len < 256);

        bufc[msglen++] = rbname_len;
        memcpy(bufc + msglen, rbname, rbname_len);
        msglen += rbname_len;
        migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc);
    } else {
        migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc);
    }
}
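
/*
 * Resulting REQ_PAGES_ID payload layout, reconstructed from the code
 * above (the RAMBlock name "pc.ram" is just an example):
 *
 *   offset 0..7   : start, big-endian 64 bit
 *   offset 8..11  : len, big-endian 32 bit
 *   offset 12     : strlen(rbname), one byte (0..255)
 *   offset 13..   : rbname bytes, not NUL-terminated, e.g. "pc.ram"
 *
 * A plain REQ_PAGES message stops after the first 12 bytes.
 */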

void qemu_start_incoming_migration(const char *uri, Error **errp)
{
    const char *p;

    qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort);
    if (!strcmp(uri, "defer")) {
        deferred_incoming_migration(errp);
    } else if (strstart(uri, "tcp:", &p)) {
        tcp_start_incoming_migration(p, errp);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_incoming_migration(p, errp);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "unix:", &p)) {
        unix_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}
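
/*
 * Example URIs accepted by the dispatcher above (host names, paths and
 * descriptor numbers are illustrative):
 *
 *   -incoming tcp:0.0.0.0:4444     listen on a TCP port
 *   -incoming rdma:host:4444       RDMA transport, only with CONFIG_RDMA
 *   -incoming exec:nc -l 4444      read the stream from a command
 *   -incoming unix:/tmp/migrate    listen on a UNIX socket
 *   -incoming fd:42                use an already-open file descriptor
 *   -incoming defer                wait for a later migrate-incoming
 */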

static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    /* Make sure all file formats flush their mutable metadata */
    bdrv_invalidate_cache_all(&local_err);
    if (local_err) {
        migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_FAILED);
        error_report_err(local_err);
        migrate_decompress_threads_join();
        exit(EXIT_FAILURE);
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self();

    /* If global state section was not received or we are in running
       state, we need to obey autostart. Any other state is set with
       runstate_set. */

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else {
        runstate_set(global_state_get_runstate());
    }
    migrate_decompress_threads_join();
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
}

static void process_incoming_migration_co(void *opaque)
{
    QEMUFile *f = opaque;
    MigrationIncomingState *mis;
    PostcopyState ps;
    int ret;

    mis = migration_incoming_state_new(f);
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_ACTIVE);
    ret = qemu_loadvm_state(f);

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else if something went wrong then just fall out of the normal exit */
    }

    qemu_fclose(f);
    free_xbzrle_decoded_buf();

    if (ret < 0) {
        migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_FAILED);
        error_report("load of migration failed: %s", strerror(-ret));
        migrate_decompress_threads_join();
        exit(EXIT_FAILURE);
    }

    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    qemu_bh_schedule(mis->bh);
}

void migration_fd_process_incoming(QEMUFile *f)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, f);

    migrate_decompress_threads_create();
    qemu_file_set_blocking(f, false);
    qemu_coroutine_enter(co);
}


void migration_channel_process_incoming(MigrationState *s,
                                        QIOChannel *ioc)
{
    trace_migration_set_incoming_channel(
        ioc, object_get_typename(OBJECT(ioc)));

    if (s->parameters.tls_creds &&
        !object_dynamic_cast(OBJECT(ioc),
                             TYPE_QIO_CHANNEL_TLS)) {
        Error *local_err = NULL;
        migration_tls_channel_process_incoming(s, ioc, &local_err);
        if (local_err) {
            error_report_err(local_err);
        }
    } else {
        QEMUFile *f = qemu_fopen_channel_input(ioc);
        migration_fd_process_incoming(f);
    }
}


void migration_channel_connect(MigrationState *s,
                               QIOChannel *ioc,
                               const char *hostname)
{
    trace_migration_set_outgoing_channel(
        ioc, object_get_typename(OBJECT(ioc)), hostname);

    if (s->parameters.tls_creds &&
        !object_dynamic_cast(OBJECT(ioc),
                             TYPE_QIO_CHANNEL_TLS)) {
        Error *local_err = NULL;
        migration_tls_channel_connect(s, ioc, hostname, &local_err);
        if (local_err) {
            migrate_fd_error(s, local_err);
            error_free(local_err);
        }
    } else {
        QEMUFile *f = qemu_fopen_channel_output(ioc);

        s->to_dst_file = f;

        migrate_fd_connect(s);
    }
}


/*
 * Send a message on the return channel back to the source
 * of the migration.
 */
void migrate_send_rp_message(MigrationIncomingState *mis,
                             enum mig_rp_message_type message_type,
                             uint16_t len, void *data)
{
    trace_migrate_send_rp_message((int)message_type, len);
    qemu_mutex_lock(&mis->rp_mutex);
    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
    qemu_put_be16(mis->to_src_file, len);
    qemu_put_buffer(mis->to_src_file, data, len);
    qemu_fflush(mis->to_src_file);
    qemu_mutex_unlock(&mis->rp_mutex);
}
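
/*
 * Per the code above, every return-path message therefore travels as:
 *
 *   be16 message_type   one of the MIG_RP_MSG_* values
 *   be16 len            payload length in bytes
 *   byte data[len]      payload
 *
 * For instance (illustrative), a PONG is type MIG_RP_MSG_PONG with
 * len 4, followed by a big-endian 32 bit value.
 */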

/*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP.  Non-0 value indicates
 * error.
 */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
}

/*
 * Send a 'PONG' message on the return channel with the given value
 * (normally in response to a 'PING')
 */
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}

/* Amount of nanoseconds we are willing to wait for migration downtime.
 * The choice of nanoseconds is because it is the maximum resolution that
 * get_clock() can achieve.  It is an internal measure; all user-visible
 * units must be in seconds. */
static uint64_t max_downtime = 300000000;

uint64_t migrate_max_downtime(void)
{
    return max_downtime;
}

MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
{
    MigrationCapabilityStatusList *head = NULL;
    MigrationCapabilityStatusList *caps;
    MigrationState *s = migrate_get_current();
    int i;

    caps = NULL; /* silence compiler warning */
    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (head == NULL) {
            head = g_malloc0(sizeof(*caps));
            caps = head;
        } else {
            caps->next = g_malloc0(sizeof(*caps));
            caps = caps->next;
        }
        caps->value =
            g_malloc(sizeof(*caps->value));
        caps->value->capability = i;
        caps->value->state = s->enabled_capabilities[i];
    }

    return head;
}

MigrationParameters *qmp_query_migrate_parameters(Error **errp)
{
    MigrationParameters *params;
    MigrationState *s = migrate_get_current();

    params = g_malloc0(sizeof(*params));
    params->compress_level = s->parameters.compress_level;
    params->compress_threads = s->parameters.compress_threads;
    params->decompress_threads = s->parameters.decompress_threads;
    params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
    params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
    params->tls_creds = g_strdup(s->parameters.tls_creds);
    params->tls_hostname = g_strdup(s->parameters.tls_hostname);

    return params;
}

/*
 * Return true if we're already in the middle of a migration
 * (i.e. any of the active or setup states)
 */
static bool migration_is_setup_or_active(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_SETUP:
        return true;

    default:
        return false;

    }
}

static void get_xbzrle_cache_stats(MigrationInfo *info)
{
    if (migrate_use_xbzrle()) {
        info->has_xbzrle_cache = true;
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred();
        info->xbzrle_cache->pages = xbzrle_mig_pages_transferred();
        info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss();
        info->xbzrle_cache->cache_miss_rate = xbzrle_mig_cache_miss_rate();
        info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow();
    }
}

static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
    info->has_ram = true;
    info->ram = g_malloc0(sizeof(*info->ram));
    info->ram->transferred = ram_bytes_transferred();
    info->ram->total = ram_bytes_total();
    info->ram->duplicate = dup_mig_pages_transferred();
    info->ram->skipped = skipped_mig_pages_transferred();
    info->ram->normal = norm_mig_pages_transferred();
    info->ram->normal_bytes = norm_mig_bytes_transferred();
    info->ram->mbps = s->mbps;
    info->ram->dirty_sync_count = s->dirty_sync_count;
    info->ram->postcopy_requests = s->postcopy_requests;

    if (s->state != MIGRATION_STATUS_COMPLETED) {
        info->ram->remaining = ram_bytes_remaining();
        info->ram->dirty_pages_rate = s->dirty_pages_rate;
    }
}
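
/*
 * The MigrationInfo built below is what a QMP client sees.  A trimmed,
 * illustrative exchange (all field values are invented):
 *
 *   -> { "execute": "query-migrate" }
 *   <- { "return": { "status": "active",
 *                    "ram": { "transferred": 123456, "remaining": 123,
 *                             "total": 246913, "mbps": 890.4 },
 *                    "total-time": 12345, "setup-time": 12 } }
 */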

MigrationInfo *qmp_query_migrate(Error **errp)
{
    MigrationInfo *info = g_malloc0(sizeof(*info));
    MigrationState *s = migrate_get_current();

    switch (s->state) {
    case MIGRATION_STATUS_NONE:
        /* no migration has happened ever */
        break;
    case MIGRATION_STATUS_SETUP:
        info->has_status = true;
        info->has_total_time = false;
        break;
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_CANCELLING:
        info->has_status = true;
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
            - s->total_time;
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        populate_ram_info(info, s);

        if (blk_mig_active()) {
            info->has_disk = true;
            info->disk = g_malloc0(sizeof(*info->disk));
            info->disk->transferred = blk_mig_bytes_transferred();
            info->disk->remaining = blk_mig_bytes_remaining();
            info->disk->total = blk_mig_bytes_total();
        }

        if (cpu_throttle_active()) {
            info->has_cpu_throttle_percentage = true;
            info->cpu_throttle_percentage = cpu_throttle_get_percentage();
        }

        get_xbzrle_cache_stats(info);
        break;
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
        /* Mostly the same as active; TODO add some postcopy stats */
        info->has_status = true;
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
            - s->total_time;
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        populate_ram_info(info, s);

        if (blk_mig_active()) {
            info->has_disk = true;
            info->disk = g_malloc0(sizeof(*info->disk));
            info->disk->transferred = blk_mig_bytes_transferred();
            info->disk->remaining = blk_mig_bytes_remaining();
            info->disk->total = blk_mig_bytes_total();
        }

        get_xbzrle_cache_stats(info);
        break;
    case MIGRATION_STATUS_COMPLETED:
        get_xbzrle_cache_stats(info);

        info->has_status = true;
        info->has_total_time = true;
        info->total_time = s->total_time;
        info->has_downtime = true;
        info->downtime = s->downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        populate_ram_info(info, s);
        break;
    case MIGRATION_STATUS_FAILED:
        info->has_status = true;
        if (s->error) {
            info->has_error_desc = true;
            info->error_desc = g_strdup(error_get_pretty(s->error));
        }
        break;
    case MIGRATION_STATUS_CANCELLED:
        info->has_status = true;
        break;
    }
    info->status = s->state;

    return info;
}

void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
                                  Error **errp)
{
    MigrationState *s = migrate_get_current();
    MigrationCapabilityStatusList *cap;
    bool old_postcopy_cap = migrate_postcopy_ram();

    if (migration_is_setup_or_active(s->state)) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return;
    }

    for (cap = params; cap; cap = cap->next) {
        s->enabled_capabilities[cap->value->capability] = cap->value->state;
    }

    if (migrate_postcopy_ram()) {
        if (migrate_use_compression()) {
            /* The decompression threads asynchronously write into RAM
             * rather than use the atomic copies needed to avoid
             * userfaulting.  It should be possible to fix the decompression
             * threads for compatibility in future.
             */
            error_report("Postcopy is not currently compatible with "
                         "compression");
            s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM] =
                false;
        }
        /* This check is reasonably expensive, so only run it the first
         * time the capability is being set; also, it's only the
         * destination that needs special support.
         */
        if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
            !postcopy_ram_supported_by_host()) {
            /* postcopy_ram_supported_by_host will have emitted a more
             * detailed message
             */
            error_report("Postcopy is not supported");
            s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM] =
                false;
        }
    }
}
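
/*
 * Illustrative QMP usage of the command above (the capability name
 * must match the MigrationCapability definition in the QAPI schema;
 * "postcopy-ram" is an example):
 *
 *   -> { "execute": "migrate-set-capabilities",
 *        "arguments": { "capabilities":
 *            [ { "capability": "postcopy-ram", "state": true } ] } }
 *   <- { "return": {} }
 */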

void qmp_migrate_set_parameters(bool has_compress_level,
                                int64_t compress_level,
                                bool has_compress_threads,
                                int64_t compress_threads,
                                bool has_decompress_threads,
                                int64_t decompress_threads,
                                bool has_cpu_throttle_initial,
                                int64_t cpu_throttle_initial,
                                bool has_cpu_throttle_increment,
                                int64_t cpu_throttle_increment,
                                bool has_tls_creds,
                                const char *tls_creds,
                                bool has_tls_hostname,
                                const char *tls_hostname,
                                Error **errp)
{
    MigrationState *s = migrate_get_current();

    if (has_compress_level && (compress_level < 0 || compress_level > 9)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
                   "is invalid, it should be in the range of 0 to 9");
        return;
    }
    if (has_compress_threads &&
        (compress_threads < 1 || compress_threads > 255)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "compress_threads",
                   "is invalid, it should be in the range of 1 to 255");
        return;
    }
    if (has_decompress_threads &&
        (decompress_threads < 1 || decompress_threads > 255)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "decompress_threads",
                   "is invalid, it should be in the range of 1 to 255");
        return;
    }
    if (has_cpu_throttle_initial &&
        (cpu_throttle_initial < 1 || cpu_throttle_initial > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_initial",
                   "an integer in the range of 1 to 99");
        return;
    }
    if (has_cpu_throttle_increment &&
        (cpu_throttle_increment < 1 || cpu_throttle_increment > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_increment",
                   "an integer in the range of 1 to 99");
        return;
    }

    if (has_compress_level) {
        s->parameters.compress_level = compress_level;
    }
    if (has_compress_threads) {
        s->parameters.compress_threads = compress_threads;
    }
    if (has_decompress_threads) {
        s->parameters.decompress_threads = decompress_threads;
    }
    if (has_cpu_throttle_initial) {
        s->parameters.cpu_throttle_initial = cpu_throttle_initial;
    }
    if (has_cpu_throttle_increment) {
        s->parameters.cpu_throttle_increment = cpu_throttle_increment;
    }
    if (has_tls_creds) {
        g_free(s->parameters.tls_creds);
        s->parameters.tls_creds = g_strdup(tls_creds);
    }
    if (has_tls_hostname) {
        g_free(s->parameters.tls_hostname);
        s->parameters.tls_hostname = g_strdup(tls_hostname);
    }
}


void qmp_migrate_start_postcopy(Error **errp)
{
    MigrationState *s = migrate_get_current();

    if (!migrate_postcopy_ram()) {
        error_setg(errp, "Enable postcopy with migrate_set_capability before"
                   " the start of migration");
        return;
    }

    if (s->state == MIGRATION_STATUS_NONE) {
        error_setg(errp, "Postcopy must be started after migration has been"
                   " started");
        return;
    }
    /*
     * we don't error if migration has finished since that would be racy
     * with issuing this command.
     */
    atomic_set(&s->start_postcopy, true);
}

/* shared migration helpers */
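
/*
 * Transition the migration to new_state, but only if it is still in
 * old_state; the compare-and-swap makes the transition atomic against
 * concurrent updates (e.g. migrate_fd_cancel() racing with the
 * migration thread).  When the transition wins, a MIGRATION event is
 * emitted for QMP listeners.
 */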
void migrate_set_state(int *state, int old_state, int new_state)
{
    if (atomic_cmpxchg(state, old_state, new_state) == old_state) {
        trace_migrate_set_state(new_state);
        migrate_generate_event(new_state);
    }
}

static void migrate_fd_cleanup(void *opaque)
{
    MigrationState *s = opaque;

    qemu_bh_delete(s->cleanup_bh);
    s->cleanup_bh = NULL;

    flush_page_queue(s);

    if (s->to_dst_file) {
        trace_migrate_fd_cleanup();
        qemu_mutex_unlock_iothread();
        if (s->migration_thread_running) {
            qemu_thread_join(&s->thread);
            s->migration_thread_running = false;
        }
        qemu_mutex_lock_iothread();

        migrate_compress_threads_join();
        qemu_fclose(s->to_dst_file);
        s->to_dst_file = NULL;
    }

    assert((s->state != MIGRATION_STATUS_ACTIVE) &&
           (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE));

    if (s->state == MIGRATION_STATUS_CANCELLING) {
        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
                          MIGRATION_STATUS_CANCELLED);
    }

    notifier_list_notify(&migration_state_notifiers, s);
}

void migrate_fd_error(MigrationState *s, const Error *error)
{
    trace_migrate_fd_error(error ? error_get_pretty(error) : "");
    assert(s->to_dst_file == NULL);
    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_FAILED);
    if (!s->error) {
        s->error = error_copy(error);
    }
    notifier_list_notify(&migration_state_notifiers, s);
}

static void migrate_fd_cancel(MigrationState *s)
{
    int old_state;
    QEMUFile *f = migrate_get_current()->to_dst_file;
    trace_migrate_fd_cancel();

    if (s->rp_state.from_dst_file) {
        /* Shut down the rp socket, which causes the rp thread to exit */
        qemu_file_shutdown(s->rp_state.from_dst_file);
    }

    do {
        old_state = s->state;
        if (!migration_is_setup_or_active(old_state)) {
            break;
        }
        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
    } while (s->state != MIGRATION_STATUS_CANCELLING);

    /*
     * If we're unlucky the migration code might be stuck somewhere in a
     * send/write while the network has failed and is waiting to timeout;
     * if we've got shutdown(2) available then we can force it to quit.
     * The outgoing qemu file gets closed in migrate_fd_cleanup that is
     * called in a bh, so there is no race against this cancel.
     */
    if (s->state == MIGRATION_STATUS_CANCELLING && f) {
        qemu_file_shutdown(f);
    }
}
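
/*
 * Sketch of how a subsystem consumes these notifiers (the callback
 * name is hypothetical; spice's migration hooks are a real in-tree
 * user of this mechanism):
 *
 *     static void my_migration_state_cb(Notifier *notifier, void *data)
 *     {
 *         MigrationState *s = data;
 *         if (migration_has_finished(s)) {
 *             ... react to completion ...
 *         }
 *     }
 *     static Notifier my_notifier = { .notify = my_migration_state_cb };
 *     add_migration_state_change_notifier(&my_notifier);
 */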

void add_migration_state_change_notifier(Notifier *notify)
{
    notifier_list_add(&migration_state_notifiers, notify);
}

void remove_migration_state_change_notifier(Notifier *notify)
{
    notifier_remove(notify);
}

bool migration_in_setup(MigrationState *s)
{
    return s->state == MIGRATION_STATUS_SETUP;
}

bool migration_has_finished(MigrationState *s)
{
    return s->state == MIGRATION_STATUS_COMPLETED;
}

bool migration_has_failed(MigrationState *s)
{
    return (s->state == MIGRATION_STATUS_CANCELLED ||
            s->state == MIGRATION_STATUS_FAILED);
}

bool migration_in_postcopy(MigrationState *s)
{
    return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
}

bool migration_in_postcopy_after_devices(MigrationState *s)
{
    return migration_in_postcopy(s) && s->postcopy_after_devices;
}

MigrationState *migrate_init(const MigrationParams *params)
{
    MigrationState *s = migrate_get_current();

    /*
     * Reinitialise all migration state, except
     * parameters/capabilities that the user set, and
     * locks.
     */
    s->bytes_xfer = 0;
    s->xfer_limit = 0;
    s->cleanup_bh = 0;
    s->to_dst_file = NULL;
    s->state = MIGRATION_STATUS_NONE;
    s->params = *params;
    s->rp_state.from_dst_file = NULL;
    s->rp_state.error = false;
    s->mbps = 0.0;
    s->downtime = 0;
    s->expected_downtime = 0;
    s->dirty_pages_rate = 0;
    s->dirty_bytes_rate = 0;
    s->setup_time = 0;
    s->dirty_sync_count = 0;
    s->start_postcopy = false;
    s->postcopy_after_devices = false;
    s->postcopy_requests = 0;
    s->migration_thread_running = false;
    s->last_req_rb = NULL;
    error_free(s->error);
    s->error = NULL;

    migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);

    QSIMPLEQ_INIT(&s->src_page_requests);

    s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    return s;
}

static GSList *migration_blockers;

void migrate_add_blocker(Error *reason)
{
    migration_blockers = g_slist_prepend(migration_blockers, reason);
}

void migrate_del_blocker(Error *reason)
{
    migration_blockers = g_slist_remove(migration_blockers, reason);
}
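
/*
 * Typical blocker usage elsewhere in the tree (device and error text
 * are illustrative): a device that cannot be migrated allocates an
 * Error describing why and registers it,
 *
 *     Error *blocker = NULL;
 *     error_setg(&blocker, "Device 'foo' does not support migration");
 *     migrate_add_blocker(blocker);
 *
 * then calls migrate_del_blocker(blocker) and error_free() once the
 * restriction no longer applies.  migration_is_blocked() below reports
 * the first registered reason back to the user.
 */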

void qmp_migrate_incoming(const char *uri, Error **errp)
{
    Error *local_err = NULL;
    static bool once = true;

    if (!deferred_incoming) {
        error_setg(errp, "For use with '-incoming defer'");
        return;
    }
    if (!once) {
        error_setg(errp, "The incoming migration has already been started");
        return;
    }

    qemu_start_incoming_migration(uri, &local_err);

    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    once = false;
}

bool migration_is_blocked(Error **errp)
{
    if (qemu_savevm_state_blocked(errp)) {
        return true;
    }

    if (migration_blockers) {
        *errp = error_copy(migration_blockers->data);
        return true;
    }

    return false;
}

void qmp_migrate(const char *uri, bool has_blk, bool blk,
                 bool has_inc, bool inc, bool has_detach, bool detach,
                 Error **errp)
{
    Error *local_err = NULL;
    MigrationState *s = migrate_get_current();
    MigrationParams params;
    const char *p;

    params.blk = has_blk && blk;
    params.shared = has_inc && inc;

    if (migration_is_setup_or_active(s->state) ||
        s->state == MIGRATION_STATUS_CANCELLING) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return;
    }
    if (runstate_check(RUN_STATE_INMIGRATE)) {
        error_setg(errp, "Guest is waiting for an incoming migration");
        return;
    }

    if (migration_is_blocked(errp)) {
        return;
    }

    s = migrate_init(&params);

    if (strstart(uri, "tcp:", &p)) {
        tcp_start_outgoing_migration(s, p, &local_err);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_outgoing_migration(s, p, &local_err);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_outgoing_migration(s, p, &local_err);
    } else if (strstart(uri, "unix:", &p)) {
        unix_start_outgoing_migration(s, p, &local_err);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_outgoing_migration(s, p, &local_err);
    } else {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
                   "a valid migration protocol");
        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                          MIGRATION_STATUS_FAILED);
        return;
    }

    if (local_err) {
        migrate_fd_error(s, local_err);
        error_propagate(errp, local_err);
        return;
    }
}

void qmp_migrate_cancel(Error **errp)
{
    migrate_fd_cancel(migrate_get_current());
}

void qmp_migrate_set_cache_size(int64_t value, Error **errp)
{
    MigrationState *s = migrate_get_current();
    int64_t new_size;

    /* Check for truncation */
    if (value != (size_t)value) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return;
    }

    /* Cache should not be larger than guest ram size */
    if (value > ram_bytes_total()) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeds guest ram size");
        return;
    }

    new_size = xbzrle_cache_resize(value);
    if (new_size < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "is smaller than page size");
        return;
    }

    s->xbzrle_cache_size = new_size;
}

int64_t qmp_query_migrate_cache_size(Error **errp)
{
    return migrate_xbzrle_cache_size();
}

void qmp_migrate_set_speed(int64_t value, Error **errp)
{
    MigrationState *s;

    if (value < 0) {
        value = 0;
    }
    if (value > SIZE_MAX) {
        value = SIZE_MAX;
    }

    s = migrate_get_current();
    s->bandwidth_limit = value;
    if (s->to_dst_file) {
        qemu_file_set_rate_limit(s->to_dst_file,
                                 s->bandwidth_limit / XFER_LIMIT_RATIO);
    }
}

void qmp_migrate_set_downtime(double value, Error **errp)
{
    value *= 1e9;
    value = MAX(0, MIN(UINT64_MAX, value));
    max_downtime = (uint64_t)value;
}
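
/*
 * qmp_migrate_set_downtime() takes seconds from the user and stores
 * nanoseconds, clamped to [0, UINT64_MAX].  For example, a request of
 * 0.3 s becomes max_downtime = 300000000 ns, which is also the
 * built-in default above.
 */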

bool migrate_postcopy_ram(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
}

bool migrate_auto_converge(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
}

bool migrate_zero_blocks(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
}

bool migrate_use_compression(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
}

int migrate_compress_level(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->parameters.compress_level;
}

int migrate_compress_threads(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->parameters.compress_threads;
}

int migrate_decompress_threads(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->parameters.decompress_threads;
}

bool migrate_use_events(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
}

int migrate_use_xbzrle(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
}

int64_t migrate_xbzrle_cache_size(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->xbzrle_cache_size;
}

/* migration thread support */
/*
 * Something bad happened to the RP stream, mark an error
 * The caller shall print or trace something to indicate why
 */
static void mark_source_rp_bad(MigrationState *s)
{
    s->rp_state.error = true;
}

static struct rp_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name;
} rp_cmd_args[] = {
    [MIG_RP_MSG_INVALID]      = { .len = -1, .name = "INVALID" },
    [MIG_RP_MSG_SHUT]         = { .len =  4, .name = "SHUT" },
    [MIG_RP_MSG_PONG]         = { .len =  4, .name = "PONG" },
    [MIG_RP_MSG_REQ_PAGES]    = { .len = 12, .name = "REQ_PAGES" },
    [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" },
    [MIG_RP_MSG_MAX]          = { .len = -1, .name = "MAX" },
};

/*
 * Process a request for pages received on the return path.  We're
 * allowed to send more than requested (e.g. to round to our page size)
 * and we don't need to send pages that have already been sent.
 */
static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
                                        ram_addr_t start, size_t len)
{
    long our_host_ps = getpagesize();

    trace_migrate_handle_rp_req_pages(rbname, start, len);

    /*
     * Since we currently insist on matching page sizes, just sanity check
     * we're being asked for whole host pages.
     */
    if (start & (our_host_ps - 1) ||
        (len & (our_host_ps - 1))) {
        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
                     " len: %zd", __func__, start, len);
        mark_source_rp_bad(ms);
        return;
    }

    if (ram_save_queue_pages(ms, rbname, start, len)) {
        mark_source_rp_bad(ms);
    }
}

/*
 * Handles messages sent on the return path towards the source VM
 */
static void *source_return_path_thread(void *opaque)
{
    MigrationState *ms = opaque;
    QEMUFile *rp = ms->rp_state.from_dst_file;
    uint16_t header_len, header_type;
    uint8_t buf[512];
    uint32_t tmp32, sibling_error;
    ram_addr_t start = 0; /* =0 to silence warning */
    size_t len = 0, expected_len;
    int res;

    trace_source_return_path_thread_entry();
    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
           migration_is_setup_or_active(ms->state)) {
        trace_source_return_path_thread_loop_top();
        header_type = qemu_get_be16(rp);
        header_len = qemu_get_be16(rp);

        if (header_type >= MIG_RP_MSG_MAX ||
            header_type == MIG_RP_MSG_INVALID) {
            error_report("RP: Received invalid message 0x%04x length 0x%04x",
                         header_type, header_len);
            mark_source_rp_bad(ms);
            goto out;
        }

        if ((rp_cmd_args[header_type].len != -1 &&
             header_len != rp_cmd_args[header_type].len) ||
            header_len > sizeof(buf)) {
            error_report("RP: Received '%s' message (0x%04x) with "
                         "incorrect length %d expecting %zu",
                         rp_cmd_args[header_type].name, header_type, header_len,
                         (size_t)rp_cmd_args[header_type].len);
            mark_source_rp_bad(ms);
            goto out;
        }

        /* We know we've got a valid header by this point */
        res = qemu_get_buffer(rp, buf, header_len);
        if (res != header_len) {
            error_report("RP: Failed reading data for message 0x%04x"
                         " read %d expected %d",
                         header_type, res, header_len);
            mark_source_rp_bad(ms);
            goto out;
        }

        /* OK, we have the message and the data */
        switch (header_type) {
        case MIG_RP_MSG_SHUT:
            sibling_error = ldl_be_p(buf);
            trace_source_return_path_thread_shut(sibling_error);
            if (sibling_error) {
                error_report("RP: Sibling indicated error %d", sibling_error);
                mark_source_rp_bad(ms);
            }
            /*
             * We'll let the main thread deal with closing the RP;
             * we could do a shutdown(2) on it, but we're the only user
             * anyway, so there's nothing gained.
             */
            goto out;

        case MIG_RP_MSG_PONG:
            tmp32 = ldl_be_p(buf);
            trace_source_return_path_thread_pong(tmp32);
            break;

        case MIG_RP_MSG_REQ_PAGES:
            start = ldq_be_p(buf);
            len = ldl_be_p(buf + 8);
            migrate_handle_rp_req_pages(ms, NULL, start, len);
            break;

        case MIG_RP_MSG_REQ_PAGES_ID:
            expected_len = 12 + 1; /* header + termination */

            if (header_len >= expected_len) {
                start = ldq_be_p(buf);
                len = ldl_be_p(buf + 8);
                /* Now we expect an idstr */
                tmp32 = buf[12]; /* Length of the following idstr */
                buf[13 + tmp32] = '\0';
                expected_len += tmp32;
            }
            if (header_len != expected_len) {
                error_report("RP: Req_Page_id with length %d expecting %zd",
                             header_len, expected_len);
                mark_source_rp_bad(ms);
                goto out;
            }
            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
            break;

        default:
            break;
        }
    }
    if (qemu_file_get_error(rp)) {
        trace_source_return_path_thread_bad_end();
        mark_source_rp_bad(ms);
    }

    trace_source_return_path_thread_end();
out:
    ms->rp_state.from_dst_file = NULL;
    qemu_fclose(rp);
    return NULL;
}

static int open_return_path_on_source(MigrationState *ms)
{

    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
    if (!ms->rp_state.from_dst_file) {
        return -1;
    }

    trace_open_return_path_on_source();
    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);

    trace_open_return_path_on_source_continue();

    return 0;
}

/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
static int await_return_path_close_on_source(MigrationState *ms)
{
    /*
     * If this is a normal exit then the destination will send a SHUT and the
     * rp_thread will exit, however if there's an error we need to cause
     * it to exit.
     */
    if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
        /*
         * shutdown(2), if we have it, will cause it to unblock if it's stuck
         * waiting for the destination.
         */
        qemu_file_shutdown(ms->rp_state.from_dst_file);
        mark_source_rp_bad(ms);
    }
    trace_await_return_path_close_on_source_joining();
    qemu_thread_join(&ms->rp_state.rp_thread);
    trace_await_return_path_close_on_source_close();
    return ms->rp_state.error;
}

/*
 * Switch from normal iteration to postcopy
 * Returns non-0 on error
 */
static int postcopy_start(MigrationState *ms, bool *old_vm_running)
{
    int ret;
    QIOChannelBuffer *bioc;
    QEMUFile *fb;
    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_POSTCOPY_ACTIVE);

    trace_postcopy_start();
    qemu_mutex_lock_iothread();
    trace_postcopy_start_set_run();

    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
    *old_vm_running = runstate_is_running();
    global_state_store();
    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_inactivate_all();
    if (ret < 0) {
        goto fail;
    }

    /*
     * Cause any non-postcopiable, but iterative devices to
     * send out their final data.
     */
    qemu_savevm_state_complete_precopy(ms->to_dst_file, true);

    /*
     * In 'finish migrate' state and with the iothread lock held everything
     * should be quiet, but we've potentially still got dirty pages and we
     * need to tell the destination to throw away any pages it's already
     * received that are dirty.
     */
    if (ram_postcopy_send_discard_bitmap(ms)) {
        error_report("postcopy send discard bitmap failed");
        goto fail;
    }

    /*
     * send rest of state - note things that are doing postcopy
     * will notice we're in POSTCOPY_ACTIVE and not actually
     * wrap their state up here
     */
    qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
    /* Ping just for debugging, helps line traces up */
    qemu_savevm_send_ping(ms->to_dst_file, 2);

    /*
     * While loading the device state we may trigger page transfer
     * requests and the fd must be free to process those, and thus
     * the destination must read the whole device state off the fd before
     * it starts processing it.  Unfortunately the ad-hoc migration format
     * doesn't allow the destination to know the size to read without fully
     * parsing it through each device's load-state code (especially the open
     * coded devices that use get/put).
     * So we wrap the device state up in a package with a length at the start;
     * to do this we use a buffered QIOChannel to hold the whole of the
     * device state.
     */
    bioc = qio_channel_buffer_new(4096);
    fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
    object_unref(OBJECT(bioc));

    /*
     * Make sure the receiver can get incoming pages before we send the rest
     * of the state
     */
    qemu_savevm_send_postcopy_listen(fb);

    qemu_savevm_state_complete_precopy(fb, false);
    qemu_savevm_send_ping(fb, 3);

    qemu_savevm_send_postcopy_run(fb);

    /* <><> end of stuff going into the package */

    /* Now send that blob */
    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
        goto fail_closefb;
    }
    qemu_fclose(fb);

    /* Send a notify to give a chance for anything that needs to happen
     * at the transition to postcopy and after the device state; in particular
     * spice needs to trigger a transition now
     */
    ms->postcopy_after_devices = true;
    notifier_list_notify(&migration_state_notifiers, ms);

    ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;

    qemu_mutex_unlock_iothread();

    /*
     * Although this ping is just for debug, it could potentially be
     * used for getting a better measurement of downtime at the source.
     */
    qemu_savevm_send_ping(ms->to_dst_file, 4);

    ret = qemu_file_get_error(ms->to_dst_file);
    if (ret) {
        error_report("postcopy_start: Migration stream errored");
        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                          MIGRATION_STATUS_FAILED);
    }

    return ret;

fail_closefb:
    qemu_fclose(fb);
fail:
    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    qemu_mutex_unlock_iothread();
    return -1;
}

/**
 * migration_completion: Used by migration_thread when there's not much left.
 *   The caller 'breaks' the loop when this returns.
 *
 * @s: Current migration state
 * @current_active_state: The migration state we expect to be in
 * @*old_vm_running: Pointer to old_vm_running flag
 * @*start_time: Pointer to time to update
 */
static void migration_completion(MigrationState *s, int current_active_state,
                                 bool *old_vm_running,
                                 int64_t *start_time)
{
    int ret;

    if (s->state == MIGRATION_STATUS_ACTIVE) {
        qemu_mutex_lock_iothread();
        *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
        *old_vm_running = runstate_is_running();
        ret = global_state_store();

        if (!ret) {
            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
            if (ret >= 0) {
                ret = bdrv_inactivate_all();
            }
            if (ret >= 0) {
                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
                qemu_savevm_state_complete_precopy(s->to_dst_file, false);
            }
        }
        qemu_mutex_unlock_iothread();

        if (ret < 0) {
            goto fail;
        }
    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        trace_migration_completion_postcopy_end();

        qemu_savevm_state_complete_postcopy(s->to_dst_file);
        trace_migration_completion_postcopy_end_after_complete();
    }

    /*
     * If rp was opened we must clean up the thread before
     * cleaning everything else up (since if there are no failures
     * it will wait for the destination to send its status in
     * a SHUT command).
     * Postcopy opens rp if enabled (even if it's not activated)
     */
    if (migrate_postcopy_ram()) {
        int rp_error;
        trace_migration_completion_postcopy_end_before_rp();
        rp_error = await_return_path_close_on_source(s);
        trace_migration_completion_postcopy_end_after_rp(rp_error);
        if (rp_error) {
            goto fail_invalidate;
        }
    }

    if (qemu_file_get_error(s->to_dst_file)) {
        trace_migration_completion_file_err();
        goto fail_invalidate;
    }

    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_COMPLETED);
    return;

fail_invalidate:
    /* If not doing postcopy, vm_start() will be called: let's regain
     * control on images.
     */
    if (s->state == MIGRATION_STATUS_ACTIVE) {
        Error *local_err = NULL;

        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        }
    }

fail:
    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_FAILED);
}

/*
 * Master migration thread on the source VM.
 * It drives the migration and pumps the data down the outgoing channel.
 */
static void *migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    /* Used by the bandwidth calcs, updated later */
    int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    int64_t initial_bytes = 0;
    int64_t max_size = 0;
    int64_t start_time = initial_time;
    int64_t end_time;
    bool old_vm_running = false;
    bool entered_postcopy = false;
    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
    enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;

    rcu_register_thread();

    qemu_savevm_state_header(s->to_dst_file);

    if (migrate_postcopy_ram()) {
        /* Now tell the dest that it should open its end so it can reply */
        qemu_savevm_send_open_return_path(s->to_dst_file);

        /* And do a ping that will make stuff easier to debug */
        qemu_savevm_send_ping(s->to_dst_file, 1);

        /*
         * Tell the destination that we *might* want to do postcopy later;
         * if the other end can't do postcopy it should fail now, nice and
         * early.
         */
        qemu_savevm_send_postcopy_advise(s->to_dst_file);
    }

    qemu_savevm_state_begin(s->to_dst_file, &s->params);

    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
    current_active_state = MIGRATION_STATUS_ACTIVE;
    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_ACTIVE);

    trace_migration_thread_setup_complete();

    while (s->state == MIGRATION_STATUS_ACTIVE ||
           s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        int64_t current_time;
        uint64_t pending_size;

        if (!qemu_file_rate_limit(s->to_dst_file)) {
            uint64_t pend_post, pend_nonpost;

            qemu_savevm_state_pending(s->to_dst_file, max_size, &pend_nonpost,
                                      &pend_post);
            pending_size = pend_nonpost + pend_post;
            trace_migrate_pending(pending_size, max_size,
                                  pend_post, pend_nonpost);
            if (pending_size && pending_size >= max_size) {
                /* Still a significant amount to transfer */

                if (migrate_postcopy_ram() &&
                    s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
                    pend_nonpost <= max_size &&
                    atomic_read(&s->start_postcopy)) {

                    if (!postcopy_start(s, &old_vm_running)) {
                        current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
                        entered_postcopy = true;
                    }

                    continue;
                }
                /* Just another iteration step */
                qemu_savevm_state_iterate(s->to_dst_file, entered_postcopy);
            } else {
                trace_migration_thread_low_pending(pending_size);
                migration_completion(s, current_active_state,
                                     &old_vm_running, &start_time);
                break;
            }
        }

        if (qemu_file_get_error(s->to_dst_file)) {
            migrate_set_state(&s->state, current_active_state,
                              MIGRATION_STATUS_FAILED);
            trace_migration_thread_file_err();
            break;
        }
        current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
        if (current_time >= initial_time + BUFFER_DELAY) {
            uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) -
                                         initial_bytes;
            uint64_t time_spent = current_time - initial_time;
            double bandwidth = (double)transferred_bytes / time_spent;
            max_size = bandwidth * migrate_max_downtime() / 1000000;
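            /*
             * Worked example with made-up numbers: 32 MiB moved in a
             * 100 ms window gives bandwidth ~= 335544 bytes/ms; with
             * the default max_downtime of 300000000 ns (300 ms) this
             * yields max_size ~= 96 MiB, the amount of outstanding
             * data we believe can be flushed within the allowed
             * downtime once we stop the guest.
             */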

            s->mbps = (((double) transferred_bytes * 8.0) /
                       ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;

            trace_migrate_transferred(transferred_bytes, time_spent,
                                      bandwidth, max_size);
            /* If we haven't sent anything, we don't want to recalculate;
               10000 is a small enough number for our purposes */
            if (s->dirty_bytes_rate && transferred_bytes > 10000) {
                s->expected_downtime = s->dirty_bytes_rate / bandwidth;
            }

            qemu_file_reset_rate_limit(s->to_dst_file);
            initial_time = current_time;
            initial_bytes = qemu_ftell(s->to_dst_file);
        }
        if (qemu_file_rate_limit(s->to_dst_file)) {
            /* usleep expects microseconds */
            g_usleep((initial_time + BUFFER_DELAY - current_time) * 1000);
        }
    }

    trace_migration_thread_after_loop();
    /* If we enabled cpu throttling for auto-converge, turn it off. */
    cpu_throttle_stop();
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    qemu_mutex_lock_iothread();
    qemu_savevm_state_cleanup();
    if (s->state == MIGRATION_STATUS_COMPLETED) {
        uint64_t transferred_bytes = qemu_ftell(s->to_dst_file);
        s->total_time = end_time - s->total_time;
        if (!entered_postcopy) {
            s->downtime = end_time - start_time;
        }
        if (s->total_time) {
            s->mbps = (((double) transferred_bytes * 8.0) /
                       ((double) s->total_time)) / 1000;
        }
        runstate_set(RUN_STATE_POSTMIGRATE);
    } else {
        if (old_vm_running && !entered_postcopy) {
            vm_start();
        } else {
            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
                runstate_set(RUN_STATE_POSTMIGRATE);
            }
        }
    }
    qemu_bh_schedule(s->cleanup_bh);
    qemu_mutex_unlock_iothread();

    rcu_unregister_thread();
    return NULL;
}

void migrate_fd_connect(MigrationState *s)
{
    /* This is a best first approximation; convert ns to ms */
    s->expected_downtime = max_downtime / 1000000;
    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);

    qemu_file_set_blocking(s->to_dst_file, true);
    qemu_file_set_rate_limit(s->to_dst_file,
                             s->bandwidth_limit / XFER_LIMIT_RATIO);

    /* Notify before starting migration thread */
    notifier_list_notify(&migration_state_notifiers, s);

    /*
     * Open the return path; currently for postcopy but other things might
     * also want it.
     */
    if (migrate_postcopy_ram()) {
        if (open_return_path_on_source(s)) {
            error_report("Unable to open return-path for postcopy");
            migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                              MIGRATION_STATUS_FAILED);
            migrate_fd_cleanup(s);
            return;
        }
    }

    migrate_compress_threads_create();
    qemu_thread_create(&s->thread, "migration", migration_thread, s,
                       QEMU_THREAD_JOINABLE);
    s->migration_thread_running = true;
}

PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}