/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "migration/blocker.h"
#include "exec.h"
#include "fd.h"
#include "socket.h"
#include "sysemu/runstate.h"
#include "sysemu/sysemu.h"
#include "sysemu/cpu-throttle.h"
#include "rdma.h"
#include "ram.h"
#include "migration/global_state.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration-stats.h"
#include "savevm.h"
#include "qemu-file.h"
#include "channel.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-visit-migration.h"
#include "qapi/qapi-visit-sockets.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qnull.h"
#include "qemu/rcu.h"
#include "block.h"
#include "postcopy-ram.h"
#include "qemu/thread.h"
#include "trace.h"
#include "exec/target_page.h"
#include "io/channel-buffer.h"
#include "io/channel-tls.h"
#include "migration/colo.h"
#include "hw/boards.h"
#include "monitor/monitor.h"
#include "net/announce.h"
#include "qemu/queue.h"
#include "multifd.h"
#include "threadinfo.h"
#include "qemu/yank.h"
#include "sysemu/cpus.h"
#include "yank_functions.h"
#include "sysemu/qtest.h"
#include "options.h"

static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

/* Messages sent on the return path from destination to source */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32) */

    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */

    MIG_RP_MSG_MAX
};
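/*
 * On the wire, each return-path message is framed by
 * migrate_send_rp_message() as:
 *
 *   be16 message type (one of the values above)
 *   be16 payload length
 *   <length> bytes of payload
 *
 * The expected payload length of each type is validated against the
 * rp_cmd_args[] table in source_return_path_thread().
 */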
/* When we add fault tolerance, we could have several
   migrations at once.  For now we don't need to add
   dynamic creation of migration */

static MigrationState *current_migration;
static MigrationIncomingState *current_incoming;

static GSList *migration_blockers;

static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state);
static void migrate_fd_cancel(MigrationState *s);

static bool migration_needs_multiple_sockets(void)
{
    return migrate_multifd() || migrate_postcopy_preempt();
}

static bool uri_supports_multi_channels(const char *uri)
{
    return strstart(uri, "tcp:", NULL) || strstart(uri, "unix:", NULL) ||
           strstart(uri, "vsock:", NULL);
}

static bool
migration_channels_and_uri_compatible(const char *uri, Error **errp)
{
    if (migration_needs_multiple_sockets() &&
        !uri_supports_multi_channels(uri)) {
        error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)");
        return false;
    }

    return true;
}

static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
{
    uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;

    return (a > b) - (a < b);
}

void migration_object_init(void)
{
    /* This can only be called once. */
    assert(!current_migration);
    current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));

    /*
     * Init the migrate incoming object as well no matter whether
     * we'll use it or not.
     */
    assert(!current_incoming);
    current_incoming = g_new0(MigrationIncomingState, 1);
    current_incoming->state = MIGRATION_STATUS_NONE;
    current_incoming->postcopy_remote_fds =
        g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
    qemu_mutex_init(&current_incoming->rp_mutex);
    qemu_mutex_init(&current_incoming->postcopy_prio_thread_mutex);
    qemu_event_init(&current_incoming->main_thread_load_event, false);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
    qemu_sem_init(&current_incoming->postcopy_qemufile_dst_done, 0);

    qemu_mutex_init(&current_incoming->page_request_mutex);
    current_incoming->page_requested = g_tree_new(page_request_addr_cmp);

    migration_object_check(current_migration, &error_fatal);

    blk_mig_init();
    ram_mig_init();
    dirty_bitmap_mig_init();
}

void migration_cancel(const Error *error)
{
    if (error) {
        migrate_set_error(current_migration, error);
    }
    migrate_fd_cancel(current_migration);
}

void migration_shutdown(void)
{
    /*
     * When the QEMU main thread exits, the COLO thread may be waiting
     * on a semaphore, so wake it up before shutting migration down.
     */
    colo_shutdown();
    /*
     * Cancel the current migration - that will (eventually)
     * stop the migration using this structure
     */
    migration_cancel(NULL);
    object_unref(OBJECT(current_migration));

    /*
     * Cancel outgoing migration of dirty bitmaps. It should
     * at least unref used block nodes.
     */
    dirty_bitmap_mig_cancel_outgoing();

    /*
     * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
     * are non-critical data, and their loss is never considered
     * serious.
     */
    dirty_bitmap_mig_cancel_incoming();
}

/* For outgoing */
MigrationState *migrate_get_current(void)
{
    /* This can only be called after the object is created. */
    assert(current_migration);
    return current_migration;
}

MigrationIncomingState *migration_incoming_get_current(void)
{
    assert(current_incoming);
    return current_incoming;
}

void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
{
    if (mis->socket_address_list) {
        qapi_free_SocketAddressList(mis->socket_address_list);
        mis->socket_address_list = NULL;
    }

    if (mis->transport_cleanup) {
        mis->transport_cleanup(mis->transport_data);
        mis->transport_data = mis->transport_cleanup = NULL;
    }
}
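/*
 * Tear down the incoming migration state once the stream has been
 * fully processed (or has failed): tell the source we are done if the
 * return path is still open, close both directions of the main
 * channel, release postcopy resources and drop the yank instance.
 */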
void migration_incoming_state_destroy(void)
{
    struct MigrationIncomingState *mis = migration_incoming_get_current();

    multifd_load_cleanup();

    if (mis->to_src_file) {
        /* Tell source that we are done */
        migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
        qemu_fclose(mis->to_src_file);
        mis->to_src_file = NULL;
    }

    if (mis->from_src_file) {
        migration_ioc_unregister_yank_from_file(mis->from_src_file);
        qemu_fclose(mis->from_src_file);
        mis->from_src_file = NULL;
    }
    if (mis->postcopy_remote_fds) {
        g_array_free(mis->postcopy_remote_fds, TRUE);
        mis->postcopy_remote_fds = NULL;
    }

    migration_incoming_transport_cleanup(mis);
    qemu_event_reset(&mis->main_thread_load_event);

    if (mis->page_requested) {
        g_tree_destroy(mis->page_requested);
        mis->page_requested = NULL;
    }

    if (mis->postcopy_qemufile_dst) {
        migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
        qemu_fclose(mis->postcopy_qemufile_dst);
        mis->postcopy_qemufile_dst = NULL;
    }

    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}

static void migrate_generate_event(int new_state)
{
    if (migrate_events()) {
        qapi_event_send_migration(new_state);
    }
}

/*
 * Send a message on the return channel back to the source
 * of the migration.
 */
static int migrate_send_rp_message(MigrationIncomingState *mis,
                                   enum mig_rp_message_type message_type,
                                   uint16_t len, void *data)
{
    int ret = 0;

    trace_migrate_send_rp_message((int)message_type, len);
    QEMU_LOCK_GUARD(&mis->rp_mutex);

    /*
     * It's possible that the file handle got lost due to network
     * failures.
     */
    if (!mis->to_src_file) {
        ret = -EIO;
        return ret;
    }

    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
    qemu_put_be16(mis->to_src_file, len);
    qemu_put_buffer(mis->to_src_file, data, len);
    qemu_fflush(mis->to_src_file);

    /* It's possible that the QEMUFile got an error during sending */
    ret = qemu_file_get_error(mis->to_src_file);

    return ret;
}
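/*
 * Payload layout of a page request, as packed into bufc below:
 *
 *   MIG_RP_MSG_REQ_PAGES:    start (be64) | len (be32)
 *   MIG_RP_MSG_REQ_PAGES_ID: start (be64) | len (be32) |
 *                            rbname_len (1 byte) | rbname (string)
 *
 * The ID variant is only needed when the requested RAMBlock differs
 * from the one used by the previous request (see mis->last_rb).
 */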
/* Request one page from the source VM at the given start address.
 * rb: the RAMBlock to request the page in
 * start: address offset within the RB
 * len: length in bytes required - must be a multiple of pagesize
 */
int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
                                      RAMBlock *rb, ram_addr_t start)
{
    uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 255 */
    size_t msglen = 12; /* start + len */
    size_t len = qemu_ram_pagesize(rb);
    enum mig_rp_message_type msg_type;
    const char *rbname;
    int rbname_len;

    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);

    /*
     * We maintain the last ramblock that we requested a page from. Note that
     * we don't need locking because this function will only be called within
     * the postcopy ram fault thread.
     */
    if (rb != mis->last_rb) {
        mis->last_rb = rb;

        rbname = qemu_ram_get_idstr(rb);
        rbname_len = strlen(rbname);

        assert(rbname_len < 256);

        bufc[msglen++] = rbname_len;
        memcpy(bufc + msglen, rbname, rbname_len);
        msglen += rbname_len;
        msg_type = MIG_RP_MSG_REQ_PAGES_ID;
    } else {
        msg_type = MIG_RP_MSG_REQ_PAGES;
    }

    return migrate_send_rp_message(mis, msg_type, msglen, bufc);
}

int migrate_send_rp_req_pages(MigrationIncomingState *mis,
                              RAMBlock *rb, ram_addr_t start, uint64_t haddr)
{
    void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
    bool received = false;

    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
        received = ramblock_recv_bitmap_test_byte_offset(rb, start);
        if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
            /*
             * The page has not been received, and it's not yet in the page
             * request list. Queue it. Set the value of element to 1, so that
             * things like g_tree_lookup() will return TRUE (1) when found.
             */
            g_tree_insert(mis->page_requested, aligned, (gpointer)1);
            mis->page_requested_count++;
            trace_postcopy_page_req_add(aligned, mis->page_requested_count);
        }
    }

    /*
     * If the page is there, skip sending the message. We don't even need the
     * lock because as long as the page arrived, it'll be there forever.
     */
    if (received) {
        return 0;
    }

    return migrate_send_rp_message_req_pages(mis, rb, start);
}

static bool migration_colo_enabled;
bool migration_incoming_colo_enabled(void)
{
    return migration_colo_enabled;
}

void migration_incoming_disable_colo(void)
{
    ram_block_discard_disable(false);
    migration_colo_enabled = false;
}

int migration_incoming_enable_colo(void)
{
    if (ram_block_discard_disable(true)) {
        error_report("COLO: cannot disable RAM discard");
        return -EBUSY;
    }
    migration_colo_enabled = true;
    return 0;
}

void migrate_add_address(SocketAddress *address)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    QAPI_LIST_PREPEND(mis->socket_address_list,
                      QAPI_CLONE(SocketAddress, address));
}
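/*
 * Start an incoming migration for the URI given with -incoming or
 * migrate-incoming.  The scheme prefix selects the transport: "tcp:",
 * "unix:" and "vsock:" use the socket code, "rdma:" the RDMA code
 * (when built in), "exec:" a spawned command, and "fd:" an
 * already-open file descriptor (e.g. "tcp:0:4444" or "unix:/some/path";
 * the exact address syntax is documented with the -incoming option).
 */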
static void qemu_start_incoming_migration(const char *uri, Error **errp)
{
    const char *p = NULL;

    /* URI is not suitable for migration? */
    if (!migration_channels_and_uri_compatible(uri, errp)) {
        return;
    }

    qapi_event_send_migration(MIGRATION_STATUS_SETUP);
    if (strstart(uri, "tcp:", &p) ||
        strstart(uri, "unix:", NULL) ||
        strstart(uri, "vsock:", NULL)) {
        socket_start_incoming_migration(p ? p : uri, errp);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_incoming_migration(p, errp);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}

static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    /* If capability late_block_activate is set:
     * Only fire up the block code now if we're going to restart the
     * VM, else 'cont' will do it.
     * This causes file locking to happen; so we don't want it to happen
     * unless we really are starting the VM.
     */
    if (!migrate_late_block_activate() ||
        (autostart && (!global_state_received() ||
            global_state_get_runstate() == RUN_STATE_RUNNING))) {
        /* Make sure all file formats throw away their mutable metadata.
         * If we get an error here, just don't restart the VM yet. */
        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            autostart = false;
        }
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self(&mis->announce_timer, migrate_announce_params());

    multifd_load_shutdown();

    dirty_bitmap_mig_before_vm_start();

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else if (migration_incoming_colo_enabled()) {
        migration_incoming_disable_colo();
        vm_start();
    } else {
        runstate_set(global_state_get_runstate());
    }
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
}
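/*
 * Main routine of the incoming side: run the whole migration stream
 * through qemu_loadvm_state() and then either hand control to the
 * postcopy/COLO threads or schedule the completion bottom half.  Runs
 * as a coroutine so channel reads can yield back to the main loop.
 */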
static void coroutine_fn
process_incoming_migration_co(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyState ps;
    int ret;
    Error *local_err = NULL;

    assert(mis->from_src_file);
    mis->migration_incoming_co = qemu_coroutine_self();
    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_ACTIVE);
    ret = qemu_loadvm_state(mis->from_src_file);

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else if something went wrong then just fall out of the normal exit */
    }

    /* We've got the COLO info, and know whether we are in COLO mode */
    if (!ret && migration_incoming_colo_enabled()) {
        /* Make sure all file formats throw away their mutable metadata */
        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            goto fail;
        }

        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
                           colo_process_incoming_thread, mis,
                           QEMU_THREAD_JOINABLE);
        mis->have_colo_incoming_thread = true;
        qemu_coroutine_yield();

        qemu_mutex_unlock_iothread();
        /* Wait for the COLO incoming thread to exit before freeing resources */
        qemu_thread_join(&mis->colo_incoming_thread);
        qemu_mutex_lock_iothread();
        /* We hold the global iothread lock, so it is safe here */
        colo_release_ram_cache();
    }

    if (ret < 0) {
        error_report("load of migration failed: %s", strerror(-ret));
        goto fail;
    }
    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    qemu_bh_schedule(mis->bh);
    mis->migration_incoming_co = NULL;
    return;
fail:
    local_err = NULL;
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    qemu_fclose(mis->from_src_file);

    multifd_load_cleanup();

    exit(EXIT_FAILURE);
}

/**
 * migration_incoming_setup: Setup incoming migration
 * @f: file for main migration channel
 * @errp: where to put errors
 *
 * Returns: %true on success, %false on error.
 */
static bool migration_incoming_setup(QEMUFile *f, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!mis->from_src_file) {
        mis->from_src_file = f;
    }
    qemu_file_set_blocking(f, false);
    return true;
}

void migration_incoming_process(void)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
    qemu_coroutine_enter(co);
}

/* Returns true if recovered from a paused migration, otherwise false */
static bool postcopy_try_recover(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
        /* Resumed from a paused postcopy migration */

        /* This should be set already in migration_incoming_setup() */
        assert(mis->from_src_file);
        /* Postcopy uses a standalone thread to do the vm load */
        qemu_file_set_blocking(mis->from_src_file, true);

        /* Re-configure the return path */
        mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);

        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);

        /*
         * Here we only wake up the main loading thread (while the
         * other threads are still waiting), so that we can receive
         * commands from the source and answer them if needed. The
         * other threads will be woken up afterwards, once we are sure
         * that the source is ready to reply to page requests.
         */
        qemu_sem_post(&mis->postcopy_pause_sem_dst);
        return true;
    }

    return false;
}
void migration_fd_process_incoming(QEMUFile *f, Error **errp)
{
    if (!migration_incoming_setup(f, errp)) {
        return;
    }
    if (postcopy_try_recover()) {
        return;
    }
    migration_incoming_process();
}

/*
 * Returns true when we want to start a new incoming migration process,
 * false otherwise.
 */
static bool migration_should_start_incoming(bool main_channel)
{
    /* Multifd doesn't start unless all channels are established */
    if (migrate_multifd()) {
        return migration_has_all_channels();
    }

    /* Preempt channel only starts when the main channel is created */
    if (migrate_postcopy_preempt()) {
        return main_channel;
    }

    /*
     * For all other types of migration, we should only reach here when
     * it's the main channel that's being created, and we should always
     * proceed with this channel.
     */
    assert(main_channel);
    return true;
}

void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;
    QEMUFile *f;
    bool default_channel = true;
    uint32_t channel_magic = 0;
    int ret = 0;

    if (migrate_multifd() && !migrate_postcopy_ram() &&
        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
        /*
         * With multiple channels, it is possible that we receive channels
         * out of order on the destination side, causing an incorrect
         * mapping of source channels on the destination side. Check the
         * channel MAGIC to decide the type of channel. Note this is
         * best-effort: the postcopy preempt channel does not send any
         * magic number, so we avoid it for postcopy live migration. TLS
         * live migration already performs the TLS handshake while
         * initializing the main channel, so this issue cannot occur
         * with TLS.
         */
        ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
                                          sizeof(channel_magic), &local_err);

        if (ret != 0) {
            error_propagate(errp, local_err);
            return;
        }

        default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
    } else {
        default_channel = !mis->from_src_file;
    }

    if (multifd_load_setup(errp) != 0) {
        error_setg(errp, "Failed to setup multifd channels");
        return;
    }

    if (default_channel) {
        f = qemu_file_new_input(ioc);

        if (!migration_incoming_setup(f, errp)) {
            return;
        }
    } else {
        /* Multiple connections */
        assert(migration_needs_multiple_sockets());
        if (migrate_multifd()) {
            multifd_recv_new_channel(ioc, &local_err);
        } else {
            assert(migrate_postcopy_preempt());
            f = qemu_file_new_input(ioc);
            postcopy_preempt_new_channel(mis, f);
        }
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }

    if (migration_should_start_incoming(default_channel)) {
        /* If it's a recovery, we're done */
        if (postcopy_try_recover()) {
            return;
        }
        migration_incoming_process();
    }
}
/**
 * @migration_has_all_channels: We have received all channels that we need
 *
 * Returns true when we have got connections to all the channels that
 * we need for migration.
 */
bool migration_has_all_channels(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!mis->from_src_file) {
        return false;
    }

    if (migrate_multifd()) {
        return multifd_recv_all_channels_created();
    }

    if (migrate_postcopy_preempt()) {
        return mis->postcopy_qemufile_dst != NULL;
    }

    return true;
}

/*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP. Non-0 value indicates
 * error.
 */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
}

/*
 * Send a 'PONG' message on the return channel with the given value
 * (normally in response to a 'PING')
 */
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}

void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name)
{
    char buf[512];
    int len;
    int64_t res;

    /*
     * First, we send the header part. It contains only the len of
     * idstr, and the idstr itself.
     */
    len = strlen(block_name);
    buf[0] = len;
    memcpy(buf + 1, block_name, len);

    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
                     __func__);
        return;
    }

    migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);

    /*
     * Next, we dump the received bitmap to the stream.
     *
     * TODO: currently we are safe since we are the only one that is
     * using the to_src_file handle (fault thread is still paused),
     * and it's ok even not taking the mutex. However the best way is
     * to take the lock before sending the message header, and release
     * the lock after sending the bitmap.
     */
    qemu_mutex_lock(&mis->rp_mutex);
    res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
    qemu_mutex_unlock(&mis->rp_mutex);

    trace_migrate_send_rp_recv_bitmap(block_name, res);
}

void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
}

/*
 * Return true if we're already in the middle of a migration
 * (i.e. any of the active or setup states)
 */
bool migration_is_setup_or_active(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_COLO:
        return true;

    default:
        return false;
    }
}
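/*
 * Like migration_is_setup_or_active(), but from the point of view of
 * "is the state machine still moving": it additionally counts
 * CANCELLING as running, and does not count COLO, presumably because
 * COLO is a steady replication state rather than a migration in
 * flight.
 */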
bool migration_is_running(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_CANCELLING:
        return true;

    default:
        return false;
    }
}

static bool migrate_show_downtime(MigrationState *s)
{
    return (s->state == MIGRATION_STATUS_COMPLETED) || migration_in_postcopy();
}

static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
    info->has_status = true;
    info->has_setup_time = true;
    info->setup_time = s->setup_time;

    if (s->state == MIGRATION_STATUS_COMPLETED) {
        info->has_total_time = true;
        info->total_time = s->total_time;
    } else {
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                           s->start_time;
    }

    if (migrate_show_downtime(s)) {
        info->has_downtime = true;
        info->downtime = s->downtime;
    } else {
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
    }
}
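/*
 * Fill in the RAM counters for "info migrate"/query-migrate.  Note
 * that "duplicate" counts zero pages, and that "normal" is a count of
 * target pages, with "normal_bytes" derived from it via the target
 * page size.
 */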
static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
    size_t page_size = qemu_target_page_size();

    info->ram = g_malloc0(sizeof(*info->ram));
    info->ram->transferred = stat64_get(&mig_stats.transferred);
    info->ram->total = ram_bytes_total();
    info->ram->duplicate = stat64_get(&mig_stats.zero_pages);
    /* legacy value.  It is not used anymore */
    info->ram->skipped = 0;
    info->ram->normal = stat64_get(&mig_stats.normal_pages);
    info->ram->normal_bytes = info->ram->normal * page_size;
    info->ram->mbps = s->mbps;
    info->ram->dirty_sync_count =
        stat64_get(&mig_stats.dirty_sync_count);
    info->ram->dirty_sync_missed_zero_copy =
        stat64_get(&mig_stats.dirty_sync_missed_zero_copy);
    info->ram->postcopy_requests =
        stat64_get(&mig_stats.postcopy_requests);
    info->ram->page_size = page_size;
    info->ram->multifd_bytes = stat64_get(&mig_stats.multifd_bytes);
    info->ram->pages_per_second = s->pages_per_second;
    info->ram->precopy_bytes = stat64_get(&mig_stats.precopy_bytes);
    info->ram->downtime_bytes = stat64_get(&mig_stats.downtime_bytes);
    info->ram->postcopy_bytes = stat64_get(&mig_stats.postcopy_bytes);

    if (migrate_xbzrle()) {
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_counters.bytes;
        info->xbzrle_cache->pages = xbzrle_counters.pages;
        info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
        info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
        info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
        info->xbzrle_cache->overflow = xbzrle_counters.overflow;
    }

    if (migrate_compress()) {
        info->compression = g_malloc0(sizeof(*info->compression));
        info->compression->pages = compression_counters.pages;
        info->compression->busy = compression_counters.busy;
        info->compression->busy_rate = compression_counters.busy_rate;
        info->compression->compressed_size =
            compression_counters.compressed_size;
        info->compression->compression_rate =
            compression_counters.compression_rate;
    }

    if (cpu_throttle_active()) {
        info->has_cpu_throttle_percentage = true;
        info->cpu_throttle_percentage = cpu_throttle_get_percentage();
    }

    if (s->state != MIGRATION_STATUS_COMPLETED) {
        info->ram->remaining = ram_bytes_remaining();
        info->ram->dirty_pages_rate =
            stat64_get(&mig_stats.dirty_pages_rate);
    }
}

static void populate_disk_info(MigrationInfo *info)
{
    if (blk_mig_active()) {
        info->disk = g_malloc0(sizeof(*info->disk));
        info->disk->transferred = blk_mig_bytes_transferred();
        info->disk->remaining = blk_mig_bytes_remaining();
        info->disk->total = blk_mig_bytes_total();
    }
}
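/*
 * Fill the MigrationInfo reply for the outgoing side.  If no
 * migration has ever run (state is MIGRATION_STATUS_NONE) we return
 * without touching info->status, so that any status already filled in
 * by fill_destination_migration_info() is preserved.
 */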
static void fill_source_migration_info(MigrationInfo *info)
{
    MigrationState *s = migrate_get_current();
    int state = qatomic_read(&s->state);
    GSList *cur_blocker = migration_blockers;

    info->blocked_reasons = NULL;

    /*
     * There are two types of reasons a migration might be blocked;
     * a) devices marked in VMState as non-migratable, and
     * b) explicit migration blockers.
     * We need to add both of them here.
     */
    qemu_savevm_non_migratable_list(&info->blocked_reasons);

    while (cur_blocker) {
        QAPI_LIST_PREPEND(info->blocked_reasons,
                          g_strdup(error_get_pretty(cur_blocker->data)));
        cur_blocker = g_slist_next(cur_blocker);
    }
    info->has_blocked_reasons = info->blocked_reasons != NULL;

    switch (state) {
    case MIGRATION_STATUS_NONE:
        /* no migration has happened ever */
        /* do not overwrite destination migration status */
        return;
    case MIGRATION_STATUS_SETUP:
        info->has_status = true;
        info->has_total_time = false;
        break;
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
        /* TODO add some postcopy stats */
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_disk_info(info);
        populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        /* TODO: display COLO specific information (checkpoint info etc.) */
        break;
    case MIGRATION_STATUS_COMPLETED:
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_FAILED:
        info->has_status = true;
        if (s->error) {
            info->error_desc = g_strdup(error_get_pretty(s->error));
        }
        break;
    case MIGRATION_STATUS_CANCELLED:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_WAIT_UNPLUG:
        info->has_status = true;
        break;
    }
    info->status = state;
}

static void fill_destination_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->socket_address_list) {
        info->has_socket_address = true;
        info->socket_address =
            QAPI_CLONE(SocketAddressList, mis->socket_address_list);
    }

    switch (mis->state) {
    case MIGRATION_STATUS_NONE:
        return;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_COMPLETED:
        info->has_status = true;
        fill_destination_postcopy_migration_info(info);
        break;
    }
    info->status = mis->state;
}

MigrationInfo *qmp_query_migrate(Error **errp)
{
    MigrationInfo *info = g_malloc0(sizeof(*info));

    fill_destination_migration_info(info);
    fill_source_migration_info(info);

    return info;
}

void qmp_migrate_start_postcopy(Error **errp)
{
    MigrationState *s = migrate_get_current();

    if (!migrate_postcopy()) {
        error_setg(errp, "Enable postcopy with migrate_set_capability before"
                   " the start of migration");
        return;
    }

    if (s->state == MIGRATION_STATUS_NONE) {
        error_setg(errp, "Postcopy must be started after migration has been"
                   " started");
        return;
    }
    /*
     * We don't error if migration has finished, since that would be racy
     * with issuing this command.
     */
    qatomic_set(&s->start_postcopy, true);
}

/* shared migration helpers */
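/*
 * Atomically move *state from old_state to new_state and emit the
 * corresponding QMP event.  If another thread changed the state in
 * the meantime, the compare-and-swap fails and this is a no-op, which
 * is what makes concurrent state updates safe.
 */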
void migrate_set_state(int *state, int old_state, int new_state)
{
    assert(new_state < MIGRATION_STATUS__MAX);
    if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
        trace_migrate_set_state(MigrationStatus_str(new_state));
        migrate_generate_event(new_state);
    }
}
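/*
 * Release everything held by the outgoing migration: join the
 * migration thread, tear down multifd, close the outgoing and
 * postcopy-preempt files and notify state-change listeners.  This
 * runs from a bottom half (see migrate_fd_cleanup_bh() below), i.e.
 * in the main thread with the iothread lock held.
 */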
static void migrate_fd_cleanup(MigrationState *s)
{
    qemu_bh_delete(s->cleanup_bh);
    s->cleanup_bh = NULL;

    g_free(s->hostname);
    s->hostname = NULL;
    json_writer_free(s->vmdesc);
    s->vmdesc = NULL;

    qemu_savevm_state_cleanup();

    if (s->to_dst_file) {
        QEMUFile *tmp;

        trace_migrate_fd_cleanup();
        qemu_mutex_unlock_iothread();
        if (s->migration_thread_running) {
            qemu_thread_join(&s->thread);
            s->migration_thread_running = false;
        }
        qemu_mutex_lock_iothread();

        multifd_save_cleanup();
        qemu_mutex_lock(&s->qemu_file_lock);
        tmp = s->to_dst_file;
        s->to_dst_file = NULL;
        qemu_mutex_unlock(&s->qemu_file_lock);
        /*
         * Close the file handle without the lock to make sure the
         * critical section won't block for long.
         */
        migration_ioc_unregister_yank_from_file(tmp);
        qemu_fclose(tmp);
    }

    if (s->postcopy_qemufile_src) {
        migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
        qemu_fclose(s->postcopy_qemufile_src);
        s->postcopy_qemufile_src = NULL;
    }

    assert(!migration_is_active(s));

    if (s->state == MIGRATION_STATUS_CANCELLING) {
        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
                          MIGRATION_STATUS_CANCELLED);
    }

    if (s->error) {
        /* It is used by "info migrate".  We can't free it */
        error_report_err(error_copy(s->error));
    }
    notifier_list_notify(&migration_state_notifiers, s);
    block_cleanup_parameters();
    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}

static void migrate_fd_cleanup_schedule(MigrationState *s)
{
    /*
     * Ref the state for the bh, because it may be called when
     * there are already no other refs
     */
    object_ref(OBJECT(s));
    qemu_bh_schedule(s->cleanup_bh);
}

static void migrate_fd_cleanup_bh(void *opaque)
{
    MigrationState *s = opaque;
    migrate_fd_cleanup(s);
    object_unref(OBJECT(s));
}

void migrate_set_error(MigrationState *s, const Error *error)
{
    QEMU_LOCK_GUARD(&s->error_mutex);
    if (!s->error) {
        s->error = error_copy(error);
    }
}

static void migrate_error_free(MigrationState *s)
{
    QEMU_LOCK_GUARD(&s->error_mutex);
    if (s->error) {
        error_free(s->error);
        s->error = NULL;
    }
}

void migrate_fd_error(MigrationState *s, const Error *error)
{
    trace_migrate_fd_error(error_get_pretty(error));
    assert(s->to_dst_file == NULL);
    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_FAILED);
    migrate_set_error(s, error);
}
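/*
 * Force an ongoing migration towards the CANCELLING state: shut down
 * the return path, kick a paused pre-switchover migration, and if the
 * outgoing stream is stuck in a blocking write, shut the file down so
 * the migration thread can notice and exit.
 */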
static void migrate_fd_cancel(MigrationState *s)
{
    int old_state;
    QEMUFile *f = migrate_get_current()->to_dst_file;
    trace_migrate_fd_cancel();

    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
        if (s->rp_state.from_dst_file) {
            /* shut down the rp socket, causing the rp thread to exit */
            qemu_file_shutdown(s->rp_state.from_dst_file);
        }
    }

    do {
        old_state = s->state;
        if (!migration_is_running(old_state)) {
            break;
        }
        /* If the migration is paused, kick it out of the pause */
        if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
            qemu_sem_post(&s->pause_sem);
        }
        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
    } while (s->state != MIGRATION_STATUS_CANCELLING);

    /*
     * If we're unlucky the migration code might be stuck somewhere in a
     * send/write while the network has failed and is waiting to timeout;
     * if we've got shutdown(2) available then we can force it to quit.
     * The outgoing qemu file gets closed in migrate_fd_cleanup that is
     * called in a bh, so there is no race against this cancel.
     */
    if (s->state == MIGRATION_STATUS_CANCELLING && f) {
        qemu_file_shutdown(f);
    }
    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
        Error *local_err = NULL;

        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        } else {
            s->block_inactive = false;
        }
    }
}

void add_migration_state_change_notifier(Notifier *notify)
{
    notifier_list_add(&migration_state_notifiers, notify);
}

void remove_migration_state_change_notifier(Notifier *notify)
{
    notifier_remove(notify);
}

bool migration_in_setup(MigrationState *s)
{
    return s->state == MIGRATION_STATUS_SETUP;
}

bool migration_has_finished(MigrationState *s)
{
    return s->state == MIGRATION_STATUS_COMPLETED;
}

bool migration_has_failed(MigrationState *s)
{
    return (s->state == MIGRATION_STATUS_CANCELLED ||
            s->state == MIGRATION_STATUS_FAILED);
}

bool migration_in_postcopy(void)
{
    MigrationState *s = migrate_get_current();

    switch (s->state) {
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
        return true;
    default:
        return false;
    }
}

bool migration_in_postcopy_after_devices(MigrationState *s)
{
    return migration_in_postcopy() && s->postcopy_after_devices;
}

bool migration_in_incoming_postcopy(void)
{
    PostcopyState ps = postcopy_state_get();

    return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
}

bool migration_incoming_postcopy_advised(void)
{
    PostcopyState ps = postcopy_state_get();

    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

bool migration_in_bg_snapshot(void)
{
    MigrationState *s = migrate_get_current();

    return migrate_background_snapshot() &&
           migration_is_setup_or_active(s->state);
}

bool migration_is_idle(void)
{
    MigrationState *s = current_migration;

    if (!s) {
        return true;
    }

    switch (s->state) {
    case MIGRATION_STATUS_NONE:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_COMPLETED:
    case MIGRATION_STATUS_FAILED:
        return true;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_COLO:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
        return false;
    case MIGRATION_STATUS__MAX:
        g_assert_not_reached();
    }

    return false;
}

bool migration_is_active(MigrationState *s)
{
    return (s->state == MIGRATION_STATUS_ACTIVE ||
            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
}
void migrate_init(MigrationState *s)
{
    /*
     * Reinitialise all migration state, except
     * parameters/capabilities that the user set, and
     * locks.
     */
    s->cleanup_bh = 0;
    s->vm_start_bh = 0;
    s->to_dst_file = NULL;
    s->state = MIGRATION_STATUS_NONE;
    s->rp_state.from_dst_file = NULL;
    s->rp_state.error = false;
    s->mbps = 0.0;
    s->pages_per_second = 0.0;
    s->downtime = 0;
    s->expected_downtime = 0;
    s->setup_time = 0;
    s->start_postcopy = false;
    s->postcopy_after_devices = false;
    s->migration_thread_running = false;
    error_free(s->error);
    s->error = NULL;
    s->hostname = NULL;

    migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);

    s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    s->total_time = 0;
    s->vm_was_running = false;
    s->iteration_initial_bytes = 0;
    s->threshold_size = 0;
}

int migrate_add_blocker_internal(Error *reason, Error **errp)
{
    /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
    if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
        error_propagate_prepend(errp, error_copy(reason),
                                "disallowing migration blocker "
                                "(migration/snapshot in progress) for: ");
        return -EBUSY;
    }

    migration_blockers = g_slist_prepend(migration_blockers, reason);
    return 0;
}

int migrate_add_blocker(Error *reason, Error **errp)
{
    if (only_migratable) {
        error_propagate_prepend(errp, error_copy(reason),
                                "disallowing migration blocker "
                                "(--only-migratable) for: ");
        return -EACCES;
    }

    return migrate_add_blocker_internal(reason, errp);
}

void migrate_del_blocker(Error *reason)
{
    migration_blockers = g_slist_remove(migration_blockers, reason);
}

void qmp_migrate_incoming(const char *uri, Error **errp)
{
    Error *local_err = NULL;
    static bool once = true;

    if (!once) {
        error_setg(errp, "The incoming migration has already been started");
        return;
    }
    if (!runstate_check(RUN_STATE_INMIGRATE)) {
        error_setg(errp, "'-incoming' was not specified on the command line");
        return;
    }

    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
        return;
    }

    qemu_start_incoming_migration(uri, &local_err);

    if (local_err) {
        yank_unregister_instance(MIGRATION_YANK_INSTANCE);
        error_propagate(errp, local_err);
        return;
    }

    once = false;
}

void qmp_migrate_recover(const char *uri, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    /*
     * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
     * callers (no one should ignore a recover failure); if there is, it's a
     * programming error.
     */
    assert(errp);

    if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
        error_setg(errp, "Migrate recover can only be run "
                   "when postcopy is paused.");
        return;
    }

    /* If there's an existing transport, release it */
    migration_incoming_transport_cleanup(mis);

    /*
     * Note that this call will never start a real migration; it will
     * only re-setup the migration stream and poke existing migration
     * to continue using that newly established channel.
     */
    qemu_start_incoming_migration(uri, errp);
}
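/*
 * QMP migrate-pause: force an active postcopy migration into the
 * paused state by shutting down the migration channel.  On the source
 * this shuts down to_dst_file; on the destination, from_src_file.
 * Only supported while in POSTCOPY_ACTIVE.
 */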
void qmp_migrate_pause(Error **errp)
{
    MigrationState *ms = migrate_get_current();
    MigrationIncomingState *mis = migration_incoming_get_current();
    int ret;

    if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        /* Source side, during postcopy */
        qemu_mutex_lock(&ms->qemu_file_lock);
        ret = qemu_file_shutdown(ms->to_dst_file);
        qemu_mutex_unlock(&ms->qemu_file_lock);
        if (ret) {
            error_setg(errp, "Failed to pause source migration");
        }
        return;
    }

    if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        ret = qemu_file_shutdown(mis->from_src_file);
        if (ret) {
            error_setg(errp, "Failed to pause destination migration");
        }
        return;
    }

    error_setg(errp, "migrate-pause is currently only supported "
               "during postcopy-active state");
}

bool migration_is_blocked(Error **errp)
{
    if (qemu_savevm_state_blocked(errp)) {
        return true;
    }

    if (migration_blockers) {
        error_propagate(errp, error_copy(migration_blockers->data));
        return true;
    }

    return false;
}

/* Returns true if continue to migrate, or false if error detected */
static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
                            bool resume, Error **errp)
{
    Error *local_err = NULL;

    if (resume) {
        if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
            error_setg(errp, "Cannot resume if there is no "
                       "paused migration");
            return false;
        }

        /*
         * Postcopy recovery won't work well with release-ram
         * capability since release-ram will drop the page buffer as
         * soon as the page is put into the send buffer. So if a
         * network failure happens, any page buffers that have not
         * yet reached the destination VM but have already been sent
         * from the source VM will be lost forever. Refuse to let the
         * client resume such a postcopy migration. Luckily release-ram
         * was designed to only be used when src and destination VMs
         * are on the same host, so it should be fine.
         */
        if (migrate_release_ram()) {
            error_setg(errp, "Postcopy recovery cannot work "
                       "when release-ram capability is set");
            return false;
        }

        /* This is a resume, skip init status */
        return true;
    }

    if (migration_is_running(s->state)) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return false;
    }

    if (runstate_check(RUN_STATE_INMIGRATE)) {
        error_setg(errp, "Guest is waiting for an incoming migration");
        return false;
    }

    if (runstate_check(RUN_STATE_POSTMIGRATE)) {
        error_setg(errp, "Can't migrate the vm that was paused due to "
                   "previous migration");
        return false;
    }

    if (migration_is_blocked(errp)) {
        return false;
    }

    if (blk || blk_inc) {
        if (migrate_colo()) {
            error_setg(errp, "No disk migration is required in COLO mode");
            return false;
        }
        if (migrate_block() || migrate_block_incremental()) {
            error_setg(errp, "Command options are incompatible with "
                       "current migration capabilities");
            return false;
        }
        if (!migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, true, &local_err)) {
            error_propagate(errp, local_err);
            return false;
        }
        s->must_remove_block_options = true;
    }

    if (blk_inc) {
        migrate_set_block_incremental(true);
    }

    migrate_init(s);
    /*
     * Zero mig_stats and compression_counters for a
     * new migration.
     */
    memset(&mig_stats, 0, sizeof(mig_stats));
    memset(&compression_counters, 0, sizeof(compression_counters));

    return true;
}
void qmp_migrate(const char *uri, bool has_blk, bool blk,
                 bool has_inc, bool inc, bool has_detach, bool detach,
                 bool has_resume, bool resume, Error **errp)
{
    Error *local_err = NULL;
    MigrationState *s = migrate_get_current();
    const char *p = NULL;

    /* URI is not suitable for migration? */
    if (!migration_channels_and_uri_compatible(uri, errp)) {
        return;
    }

    if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
                         has_resume && resume, errp)) {
        /* Error detected, put into errp */
        return;
    }

    if (!(has_resume && resume)) {
        if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
            return;
        }
    }

    if (strstart(uri, "tcp:", &p) ||
        strstart(uri, "unix:", NULL) ||
        strstart(uri, "vsock:", NULL)) {
        socket_start_outgoing_migration(s, p ? p : uri, &local_err);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_outgoing_migration(s, p, &local_err);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_outgoing_migration(s, p, &local_err);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_outgoing_migration(s, p, &local_err);
    } else {
        if (!(has_resume && resume)) {
            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
        }
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
                   "a valid migration protocol");
        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                          MIGRATION_STATUS_FAILED);
        block_cleanup_parameters();
        return;
    }

    if (local_err) {
        if (!(has_resume && resume)) {
            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
        }
        migrate_fd_error(s, local_err);
        error_propagate(errp, local_err);
        return;
    }
}
void qmp_migrate_cancel(Error **errp)
{
    migration_cancel(NULL);
}

void qmp_migrate_continue(MigrationStatus state, Error **errp)
{
    MigrationState *s = migrate_get_current();

    if (s->state != state) {
        error_setg(errp, "Migration not in expected state: %s",
                   MigrationStatus_str(s->state));
        return;
    }
    qemu_sem_post(&s->pause_sem);
}

/* migration thread support */

/*
 * Something bad happened to the RP stream, mark an error.
 * The caller shall print or trace something to indicate why.
 */
static void mark_source_rp_bad(MigrationState *s)
{
    s->rp_state.error = true;
}
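/*
 * Expected payload length for each return-path message; -1 means the
 * length is variable.  source_return_path_thread() rejects any
 * message whose on-wire length does not match this table.
 */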
static struct rp_cmd_args {
    ssize_t len; /* -1 = variable */
    const char *name;
} rp_cmd_args[] = {
    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
    [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
};

/*
 * Process a request for pages received on the return path.
 * We're allowed to send more than requested (e.g. to round to our page size)
 * and we don't need to send pages that have already been sent.
 */
static void migrate_handle_rp_req_pages(MigrationState *ms, const char *rbname,
                                        ram_addr_t start, size_t len)
{
    long our_host_ps = qemu_real_host_page_size();

    trace_migrate_handle_rp_req_pages(rbname, start, len);

    /*
     * Since we currently insist on matching page sizes, just sanity check
     * we're being asked for whole host pages.
     */
    if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
        !QEMU_IS_ALIGNED(len, our_host_ps)) {
        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
                     " len: %zd", __func__, start, len);
        mark_source_rp_bad(ms);
        return;
    }

    if (ram_save_queue_pages(rbname, start, len)) {
        mark_source_rp_bad(ms);
    }
}

/* Return true to retry, false to quit */
static bool postcopy_pause_return_path_thread(MigrationState *s)
{
    trace_postcopy_pause_return_path();

    qemu_sem_wait(&s->postcopy_pause_rp_sem);

    trace_postcopy_pause_return_path_continued();

    return true;
}

static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);

    if (!block) {
        error_report("%s: invalid block name '%s'", __func__, block_name);
        return -EINVAL;
    }

    /* Fetch the received bitmap and refresh the dirty bitmap */
    return ram_dirty_bitmap_reload(s, block);
}

static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
{
    trace_source_return_path_thread_resume_ack(value);

    if (value != MIGRATION_RESUME_ACK_VALUE) {
        error_report("%s: illegal resume_ack value %"PRIu32,
                     __func__, value);
        return -1;
    }

    /* Now both sides are active. */
    migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
                      MIGRATION_STATUS_POSTCOPY_ACTIVE);

    /* Notify the send thread that it's time to continue sending pages */
    qemu_sem_post(&s->rp_state.rp_sem);

    return 0;
}

/*
 * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
 * it exists) in a safe way.
 */
static void migration_release_dst_files(MigrationState *ms)
{
    QEMUFile *file;

    WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
        /*
         * Reset the from_dst_file pointer first before releasing it, as we
         * can't block within the lock section.
         */
        file = ms->rp_state.from_dst_file;
        ms->rp_state.from_dst_file = NULL;
    }

    /*
     * Do the same for the postcopy fast-path socket, if there is one. No
     * locking is needed because this qemufile should only be managed by
     * the return path thread.
     */
    if (ms->postcopy_qemufile_src) {
        migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
        qemu_file_shutdown(ms->postcopy_qemufile_src);
        qemu_fclose(ms->postcopy_qemufile_src);
        ms->postcopy_qemufile_src = NULL;
    }

    qemu_fclose(file);
}
/*
 * Handles messages sent on the return path towards the source VM
 */
static void *source_return_path_thread(void *opaque)
{
    MigrationState *ms = opaque;
    QEMUFile *rp = ms->rp_state.from_dst_file;
    uint16_t header_len, header_type;
    uint8_t buf[512];
    uint32_t tmp32, sibling_error;
    ram_addr_t start = 0; /* =0 to silence warning */
    size_t len = 0, expected_len;
    int res;

    trace_source_return_path_thread_entry();
    rcu_register_thread();

retry:
    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
           migration_is_setup_or_active(ms->state)) {
        trace_source_return_path_thread_loop_top();
        header_type = qemu_get_be16(rp);
        header_len = qemu_get_be16(rp);

        if (qemu_file_get_error(rp)) {
            mark_source_rp_bad(ms);
            goto out;
        }

        if (header_type >= MIG_RP_MSG_MAX ||
            header_type == MIG_RP_MSG_INVALID) {
            error_report("RP: Received invalid message 0x%04x length 0x%04x",
                         header_type, header_len);
            mark_source_rp_bad(ms);
            goto out;
        }

        if ((rp_cmd_args[header_type].len != -1 &&
             header_len != rp_cmd_args[header_type].len) ||
            header_len > sizeof(buf)) {
            error_report("RP: Received '%s' message (0x%04x) with "
                         "incorrect length %d expecting %zu",
                         rp_cmd_args[header_type].name, header_type,
                         header_len, (size_t)rp_cmd_args[header_type].len);
            mark_source_rp_bad(ms);
            goto out;
        }

        /* We know we've got a valid header by this point */
        res = qemu_get_buffer(rp, buf, header_len);
        if (res != header_len) {
            error_report("RP: Failed reading data for message 0x%04x"
                         " read %d expected %d",
                         header_type, res, header_len);
            mark_source_rp_bad(ms);
            goto out;
        }

        /* OK, we have the message and the data */
        switch (header_type) {
        case MIG_RP_MSG_SHUT:
            sibling_error = ldl_be_p(buf);
            trace_source_return_path_thread_shut(sibling_error);
            if (sibling_error) {
                error_report("RP: Sibling indicated error %d", sibling_error);
                mark_source_rp_bad(ms);
            }
            /*
             * We'll let the main thread deal with closing the RP;
             * we could do a shutdown(2) on it, but we're the only user
             * anyway, so there's nothing gained.
             */
            goto out;

        case MIG_RP_MSG_PONG:
            tmp32 = ldl_be_p(buf);
            trace_source_return_path_thread_pong(tmp32);
            qemu_sem_post(&ms->rp_state.rp_pong_acks);
            break;

        case MIG_RP_MSG_REQ_PAGES:
            start = ldq_be_p(buf);
            len = ldl_be_p(buf + 8);
            migrate_handle_rp_req_pages(ms, NULL, start, len);
            break;

        case MIG_RP_MSG_REQ_PAGES_ID:
            expected_len = 12 + 1; /* header + termination */

            if (header_len >= expected_len) {
                start = ldq_be_p(buf);
                len = ldl_be_p(buf + 8);
                /* Now we expect an idstr */
                tmp32 = buf[12]; /* Length of the following idstr */
                buf[13 + tmp32] = '\0';
                expected_len += tmp32;
            }
            if (header_len != expected_len) {
                error_report("RP: Req_Page_id with length %d expecting %zd",
                             header_len, expected_len);
                mark_source_rp_bad(ms);
                goto out;
            }
            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
            break;

        case MIG_RP_MSG_RECV_BITMAP:
            if (header_len < 1) {
                error_report("%s: missing block name", __func__);
                mark_source_rp_bad(ms);
                goto out;
            }
            /* Format: len (1B) + idstr (<255B). This ends the idstr. */
            buf[buf[0] + 1] = '\0';
            if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
                mark_source_rp_bad(ms);
                goto out;
            }
            break;

        case MIG_RP_MSG_RESUME_ACK:
            tmp32 = ldl_be_p(buf);
            if (migrate_handle_rp_resume_ack(ms, tmp32)) {
                mark_source_rp_bad(ms);
                goto out;
            }
            break;

        default:
            break;
        }
    }

out:
    res = qemu_file_get_error(rp);
    if (res) {
        if (res && migration_in_postcopy()) {
            /*
             * Maybe there is something we can do: it looks like a
             * network down issue, and we pause for a recovery.
             */
            migration_release_dst_files(ms);
            rp = NULL;
            if (postcopy_pause_return_path_thread(ms)) {
                /*
                 * Reload rp, reset the rest. Referencing it is safe since
                 * it's reset only by us above, or when migration completes
                 */
                rp = ms->rp_state.from_dst_file;
                ms->rp_state.error = false;
                goto retry;
            }
        }

        trace_source_return_path_thread_bad_end();
        mark_source_rp_bad(ms);
    }

    trace_source_return_path_thread_end();
    migration_release_dst_files(ms);
    rcu_unregister_thread();
    return NULL;
}
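/*
 * Open the return path on the source side by asking the outgoing
 * channel for its read direction, and optionally spawn the thread
 * that services it.  Returns 0 on success, -1 if the channel cannot
 * provide a return path.
 */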
/*
 * Handles messages sent on the return path towards the source VM
 */
static void *source_return_path_thread(void *opaque)
{
    MigrationState *ms = opaque;
    QEMUFile *rp = ms->rp_state.from_dst_file;
    uint16_t header_len, header_type;
    uint8_t buf[512];
    uint32_t tmp32, sibling_error;
    ram_addr_t start = 0; /* =0 to silence warning */
    size_t len = 0, expected_len;
    int res;

    trace_source_return_path_thread_entry();
    rcu_register_thread();

retry:
    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
           migration_is_setup_or_active(ms->state)) {
        trace_source_return_path_thread_loop_top();
        header_type = qemu_get_be16(rp);
        header_len = qemu_get_be16(rp);

        if (qemu_file_get_error(rp)) {
            mark_source_rp_bad(ms);
            goto out;
        }

        if (header_type >= MIG_RP_MSG_MAX ||
            header_type == MIG_RP_MSG_INVALID) {
            error_report("RP: Received invalid message 0x%04x length 0x%04x",
                         header_type, header_len);
            mark_source_rp_bad(ms);
            goto out;
        }

        if ((rp_cmd_args[header_type].len != -1 &&
             header_len != rp_cmd_args[header_type].len) ||
            header_len > sizeof(buf)) {
            error_report("RP: Received '%s' message (0x%04x) with "
                         "incorrect length %d expecting %zu",
                         rp_cmd_args[header_type].name, header_type,
                         header_len, (size_t)rp_cmd_args[header_type].len);
            mark_source_rp_bad(ms);
            goto out;
        }

        /* We know we've got a valid header by this point */
        res = qemu_get_buffer(rp, buf, header_len);
        if (res != header_len) {
            error_report("RP: Failed reading data for message 0x%04x"
                         " read %d expected %d",
                         header_type, res, header_len);
            mark_source_rp_bad(ms);
            goto out;
        }

        /* OK, we have the message and the data */
        switch (header_type) {
        case MIG_RP_MSG_SHUT:
            sibling_error = ldl_be_p(buf);
            trace_source_return_path_thread_shut(sibling_error);
            if (sibling_error) {
                error_report("RP: Sibling indicated error %d", sibling_error);
                mark_source_rp_bad(ms);
            }
            /*
             * We'll let the main thread deal with closing the RP;
             * we could do a shutdown(2) on it, but we're the only user
             * anyway, so there's nothing gained.
             */
            goto out;

        case MIG_RP_MSG_PONG:
            tmp32 = ldl_be_p(buf);
            trace_source_return_path_thread_pong(tmp32);
            qemu_sem_post(&ms->rp_state.rp_pong_acks);
            break;

        case MIG_RP_MSG_REQ_PAGES:
            start = ldq_be_p(buf);
            len = ldl_be_p(buf + 8);
            migrate_handle_rp_req_pages(ms, NULL, start, len);
            break;

        case MIG_RP_MSG_REQ_PAGES_ID:
            expected_len = 12 + 1; /* header + termination */

            if (header_len >= expected_len) {
                start = ldq_be_p(buf);
                len = ldl_be_p(buf + 8);
                /* Now we expect an idstr */
                tmp32 = buf[12]; /* Length of the following idstr */
                buf[13 + tmp32] = '\0';
                expected_len += tmp32;
            }
            if (header_len != expected_len) {
                error_report("RP: Req_Page_id with length %d expecting %zu",
                             header_len, expected_len);
                mark_source_rp_bad(ms);
                goto out;
            }
            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
            break;

        case MIG_RP_MSG_RECV_BITMAP:
            if (header_len < 1) {
                error_report("%s: missing block name", __func__);
                mark_source_rp_bad(ms);
                goto out;
            }
            /* Format: len (1B) + idstr (<255B). This ends the idstr. */
            buf[buf[0] + 1] = '\0';
            if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
                mark_source_rp_bad(ms);
                goto out;
            }
            break;

        case MIG_RP_MSG_RESUME_ACK:
            tmp32 = ldl_be_p(buf);
            if (migrate_handle_rp_resume_ack(ms, tmp32)) {
                mark_source_rp_bad(ms);
                goto out;
            }
            break;

        default:
            break;
        }
    }

out:
    res = qemu_file_get_error(rp);
    if (res) {
        if (migration_in_postcopy()) {
            /*
             * Maybe there is something we can do: it looks like a
             * network-down issue, so we pause for a recovery.
             */
            migration_release_dst_files(ms);
            rp = NULL;
            if (postcopy_pause_return_path_thread(ms)) {
                /*
                 * Reload rp, reset the rest. Referencing it is safe since
                 * it's reset only by us above, or when migration completes
                 */
                rp = ms->rp_state.from_dst_file;
                ms->rp_state.error = false;
                goto retry;
            }
        }

        trace_source_return_path_thread_bad_end();
        mark_source_rp_bad(ms);
    }

    trace_source_return_path_thread_end();
    migration_release_dst_files(ms);
    rcu_unregister_thread();
    return NULL;
}

static int open_return_path_on_source(MigrationState *ms,
                                      bool create_thread)
{
    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
    if (!ms->rp_state.from_dst_file) {
        return -1;
    }

    trace_open_return_path_on_source();

    if (!create_thread) {
        /* We're done */
        return 0;
    }

    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
    ms->rp_state.rp_thread_created = true;

    trace_open_return_path_on_source_continue();

    return 0;
}
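/*
 * For reference, each return-path message parsed above is framed as a
 * big-endian 16-bit type, a big-endian 16-bit payload length, then the
 * payload itself. A destination-side sender could, in principle, be as
 * simple as the following sketch (hypothetical helper, not the actual
 * sender used by the incoming side):
 *
 *     static void rp_send(QEMUFile *f, uint16_t type,
 *                         const uint8_t *data, uint16_t len)
 *     {
 *         qemu_put_be16(f, type);    // one of enum mig_rp_message_type
 *         qemu_put_be16(f, len);     // payload byte count
 *         qemu_put_buffer(f, data, len);
 *         qemu_fflush(f);
 *     }
 *
 * e.g. a MIG_RP_MSG_REQ_PAGES payload is a be64 start address followed by
 * a be32 length, matching the ldq_be_p()/ldl_be_p() decoding above.
 */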
/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
static int await_return_path_close_on_source(MigrationState *ms)
{
    /*
     * If this is a normal exit then the destination will send a SHUT
     * and the rp_thread will exit; however if there's an error we need
     * to cause it to exit.
     */
    if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
        /*
         * shutdown(2), if we have it, will cause it to unblock if it's stuck
         * waiting for the destination.
         */
        qemu_file_shutdown(ms->rp_state.from_dst_file);
        mark_source_rp_bad(ms);
    }
    trace_await_return_path_close_on_source_joining();
    qemu_thread_join(&ms->rp_state.rp_thread);
    ms->rp_state.rp_thread_created = false;
    trace_await_return_path_close_on_source_close();
    return ms->rp_state.error;
}

static inline void
migration_wait_main_channel(MigrationState *ms)
{
    /* Wait until one PONG message is received */
    qemu_sem_wait(&ms->rp_state.rp_pong_acks);
}
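/*
 * A note on the wait above (a summary of the reasoning, not new protocol):
 * the source pings the destination over the main channel early on, and the
 * destination answers with MIG_RP_MSG_PONG on the return path, which posts
 * rp_pong_acks in source_return_path_thread(). So once a single PONG has
 * been consumed here, we know the main channel and the return path are both
 * fully established, and it is safe to create secondary channels (e.g. the
 * postcopy preempt channel) without the destination confusing their
 * connection order.
 */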
/*
 * Switch from normal iteration to postcopy
 * Returns non-0 on error
 */
static int postcopy_start(MigrationState *ms)
{
    int ret;
    QIOChannelBuffer *bioc;
    QEMUFile *fb;
    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    int64_t bandwidth = migrate_max_postcopy_bandwidth();
    bool restart_block = false;
    int cur_state = MIGRATION_STATUS_ACTIVE;

    if (migrate_postcopy_preempt()) {
        migration_wait_main_channel(ms);
        if (postcopy_preempt_establish_channel(ms)) {
            migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
            return -1;
        }
    }

    if (!migrate_pause_before_switchover()) {
        migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_POSTCOPY_ACTIVE);
    }

    trace_postcopy_start();
    qemu_mutex_lock_iothread();
    trace_postcopy_start_set_run();

    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
    global_state_store();
    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
    if (ret < 0) {
        goto fail;
    }

    ret = migration_maybe_pause(ms, &cur_state,
                                MIGRATION_STATUS_POSTCOPY_ACTIVE);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_inactivate_all();
    if (ret < 0) {
        goto fail;
    }
    restart_block = true;

    /*
     * Cause any non-postcopiable, but iterative devices to
     * send out their final data.
     */
    qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);

    /*
     * In the "finish-migrate" state, with the iothread lock held,
     * everything should be quiet, but we've potentially still got dirty
     * pages and we need to tell the destination to throw away any pages
     * it has already received that are dirty.
     */
    if (migrate_postcopy_ram()) {
        ram_postcopy_send_discard_bitmap(ms);
    }

    /*
     * Send the rest of the state; note that devices that do postcopy
     * will notice we're in POSTCOPY_ACTIVE and not actually wrap their
     * state up here.
     */
    /* 0 max-postcopy-bandwidth means unlimited */
    if (!bandwidth) {
        qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
    } else {
        qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
    }
    if (migrate_postcopy_ram()) {
        /* Ping just for debugging, helps line traces up */
        qemu_savevm_send_ping(ms->to_dst_file, 2);
    }

    /*
     * While loading the device state we may trigger page transfer
     * requests and the fd must be free to process those, and thus
     * the destination must read the whole device state off the fd before
     * it starts processing it. Unfortunately the ad-hoc migration format
     * doesn't allow the destination to know the size to read without fully
     * parsing it through each device's load-state code (especially the
     * open-coded devices that use get/put).
     * So we wrap the device state up in a package with a length at the
     * start; to do this we use a buffer-backed QEMUFile (QIOChannelBuffer)
     * to hold the whole of the device state.
     */
    bioc = qio_channel_buffer_new(4096);
    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
    fb = qemu_file_new_output(QIO_CHANNEL(bioc));
    object_unref(OBJECT(bioc));

    /*
     * Make sure the receiver can get incoming pages before we send the rest
     * of the state
     */
    qemu_savevm_send_postcopy_listen(fb);

    qemu_savevm_state_complete_precopy(fb, false, false);
    if (migrate_postcopy_ram()) {
        qemu_savevm_send_ping(fb, 3);
    }

    qemu_savevm_send_postcopy_run(fb);

    /* End of the contents of the package */

    /*
     * Last point of recovery; as soon as we send the package the
     * destination can open devices and potentially start running.
     * Let's just check again that we've not got any errors.
     */
    ret = qemu_file_get_error(ms->to_dst_file);
    if (ret) {
        error_report("postcopy_start: Migration stream errored (pre package)");
        goto fail_closefb;
    }

    restart_block = false;

    /* Now send that blob */
    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
        goto fail_closefb;
    }
    qemu_fclose(fb);

    /*
     * Send a notify to give a chance for anything that needs to happen
     * at the transition to postcopy and after the device state; in
     * particular spice needs to trigger a transition now.
     */
    ms->postcopy_after_devices = true;
    notifier_list_notify(&migration_state_notifiers, ms);

    ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;

    qemu_mutex_unlock_iothread();

    if (migrate_postcopy_ram()) {
        /*
         * Although this ping is just for debug, it could potentially be
         * used for getting a better measurement of downtime at the source.
         */
        qemu_savevm_send_ping(ms->to_dst_file, 4);
    }

    if (migrate_release_ram()) {
        ram_postcopy_migrated_memory_release(ms);
    }

    ret = qemu_file_get_error(ms->to_dst_file);
    if (ret) {
        error_report("postcopy_start: Migration stream errored");
        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                          MIGRATION_STATUS_FAILED);
    }

    trace_postcopy_preempt_enabled(migrate_postcopy_preempt());

    return ret;

fail_closefb:
    qemu_fclose(fb);
fail:
    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    if (restart_block) {
        /*
         * A failure happened early enough that we know the destination
         * hasn't accessed block devices, so we're safe to recover.
         */
        Error *local_err = NULL;

        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        }
    }
    qemu_mutex_unlock_iothread();
    return -1;
}
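/*
 * The package assembled in postcopy_start() above looks, on the wire,
 * roughly like this (an illustrative sketch of the layout, not a byte-exact
 * specification):
 *
 *     MIG_CMD_PACKAGED
 *         | 32-bit length of the blob below
 *         | [ postcopy-listen command ]
 *         | [ remaining non-postcopiable device state ]
 *         | [ optional ping (id 3) ]
 *         | [ postcopy-run command ]
 *
 * Because the blob's length is known up front, the destination can pull the
 * whole thing off the main channel before it starts executing it, leaving
 * the channel free for the page requests that device loading may trigger.
 */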
/**
 * migration_maybe_pause: Pause if required to by
 * migrate_pause_before_switchover; called with the iothread locked
 * Returns: 0 on success
 */
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state)
{
    if (!migrate_pause_before_switchover()) {
        return 0;
    }

    /*
     * Since leaving this state is not atomic with posting the semaphore
     * it's possible that someone could have issued multiple migrate_continue
     * and the semaphore is incorrectly positive at this point;
     * the docs say it's undefined to reinit a semaphore that's already
     * init'd, so use timedwait to eat up any existing posts.
     */
    while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
        /* This block intentionally left blank */
    }

    /*
     * If the migration is cancelled when it is in the completion phase,
     * the migration state is set to MIGRATION_STATUS_CANCELLING.
     * In that case we don't need to wait on the semaphore; otherwise we
     * would always wait for the 'pause_sem' semaphore.
     */
    if (s->state != MIGRATION_STATUS_CANCELLING) {
        qemu_mutex_unlock_iothread();
        migrate_set_state(&s->state, *current_active_state,
                          MIGRATION_STATUS_PRE_SWITCHOVER);
        qemu_sem_wait(&s->pause_sem);
        migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
                          new_state);
        *current_active_state = new_state;
        qemu_mutex_lock_iothread();
    }

    return s->state == new_state ? 0 : -EINVAL;
}
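/*
 * For context, the pause above is driven from the management side. A sketch
 * of the QMP flow (the command names come from the QAPI schema; the event
 * ordering shown is the expected one, not something this file enforces):
 *
 *     -> { "execute": "migrate-set-capabilities", "arguments": {
 *            "capabilities": [
 *              { "capability": "pause-before-switchover", "state": true } ] } }
 *     ... migration runs, then emits a MIGRATION event with
 *         "status": "pre-switchover" and blocks on pause_sem ...
 *     -> { "execute": "migrate-continue",
 *          "arguments": { "state": "pre-switchover" } }
 *
 * migrate-continue posts pause_sem, letting migration_maybe_pause() proceed
 * to the device/switchover phase.
 */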
/**
 * migration_completion: Used by migration_thread when there's not much left.
 * The caller 'breaks' the loop when this returns.
 *
 * @s: Current migration state
 */
static void migration_completion(MigrationState *s)
{
    int ret;
    int current_active_state = s->state;

    if (s->state == MIGRATION_STATUS_ACTIVE) {
        qemu_mutex_lock_iothread();
        s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
        s->vm_was_running = runstate_is_running();
        ret = global_state_store();

        if (!ret) {
            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
            trace_migration_completion_vm_stop(ret);
            if (ret >= 0) {
                ret = migration_maybe_pause(s, &current_active_state,
                                            MIGRATION_STATUS_DEVICE);
            }
            if (ret >= 0) {
                s->block_inactive = !migrate_colo();
                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
                ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
                                                         s->block_inactive);
            }
        }
        qemu_mutex_unlock_iothread();

        if (ret < 0) {
            goto fail;
        }
    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        trace_migration_completion_postcopy_end();

        qemu_mutex_lock_iothread();
        qemu_savevm_state_complete_postcopy(s->to_dst_file);
        qemu_mutex_unlock_iothread();

        /*
         * Shut down the postcopy fast path thread. This is only needed
         * when the destination QEMU binary is old (7.1/7.2). QEMU 8.0+
         * doesn't need this.
         */
        if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
            postcopy_preempt_shutdown_file(s);
        }

        trace_migration_completion_postcopy_end_after_complete();
    } else {
        goto fail;
    }

    /*
     * If the return path was opened we must clean up the thread before
     * cleaning everything else up (since if there are no failures
     * it will wait for the destination to send its status in
     * a SHUT command).
     */
    if (s->rp_state.rp_thread_created) {
        int rp_error;
        trace_migration_return_path_end_before();
        rp_error = await_return_path_close_on_source(s);
        trace_migration_return_path_end_after(rp_error);
        if (rp_error) {
            goto fail_invalidate;
        }
    }

    if (qemu_file_get_error(s->to_dst_file)) {
        trace_migration_completion_file_err();
        goto fail_invalidate;
    }

    if (migrate_colo() && s->state == MIGRATION_STATUS_ACTIVE) {
        /* COLO does not support postcopy */
        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_COLO);
    } else {
        migrate_set_state(&s->state, current_active_state,
                          MIGRATION_STATUS_COMPLETED);
    }

    return;

fail_invalidate:
    /*
     * If not doing postcopy, vm_start() will be called: let's regain
     * control of the images.
     */
    if (s->state == MIGRATION_STATUS_ACTIVE ||
        s->state == MIGRATION_STATUS_DEVICE) {
        Error *local_err = NULL;

        qemu_mutex_lock_iothread();
        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            s->block_inactive = true;
        } else {
            s->block_inactive = false;
        }
        qemu_mutex_unlock_iothread();
    }

fail:
    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_FAILED);
}
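/*
 * A rough map of the terminal transitions driven by the completion paths
 * above (an illustrative summary; the authoritative transitions are the
 * migrate_set_state() calls themselves):
 *
 *     ACTIVE ----------------------------> COMPLETED
 *     ACTIVE --(pause-before-switchover)-> PRE_SWITCHOVER -> DEVICE -> COMPLETED
 *     ACTIVE --(colo enabled)------------> COLO
 *     POSTCOPY_ACTIVE -------------------> COMPLETED
 *     any of the above --(error)---------> FAILED
 */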
/**
 * bg_migration_completion: Used by bg_migration_thread after all the
 * RAM has been saved. The caller 'breaks' the loop when this returns.
 *
 * @s: Current migration state
 */
static void bg_migration_completion(MigrationState *s)
{
    int current_active_state = s->state;

    /*
     * Stop tracking RAM writes - un-protect memory, un-register UFFD
     * memory ranges, flush kernel wait queues and wake up threads
     * waiting for write faults to be resolved.
     */
    ram_write_tracking_stop();

    if (s->state == MIGRATION_STATUS_ACTIVE) {
        /*
         * By this moment we have RAM content saved into the migration stream.
         * The next step is to flush the non-RAM content (device state)
         * right after the RAM content. The device state has been stored into
         * the temporary buffer before RAM saving started.
         */
        qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
        qemu_fflush(s->to_dst_file);
    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
        goto fail;
    }

    if (qemu_file_get_error(s->to_dst_file)) {
        trace_migration_completion_file_err();
        goto fail;
    }

    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_COMPLETED);
    return;

fail:
    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_FAILED);
}

typedef enum MigThrError {
    /* No error detected */
    MIG_THR_ERR_NONE = 0,
    /* Detected error, but resumed successfully */
    MIG_THR_ERR_RECOVERED = 1,
    /* Detected fatal error, need to exit */
    MIG_THR_ERR_FATAL = 2,
} MigThrError;

static int postcopy_resume_handshake(MigrationState *s)
{
    qemu_savevm_send_postcopy_resume(s->to_dst_file);

    while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        return 0;
    }

    return -1;
}

/* Return zero on success, or <0 on error */
static int postcopy_do_resume(MigrationState *s)
{
    int ret;

    /*
     * Call all the resume_prepare() hooks, so that modules can be
     * ready for the migration resume.
     */
    ret = qemu_savevm_state_resume_prepare(s);
    if (ret) {
        error_report("%s: resume_prepare() failure detected: %d",
                     __func__, ret);
        return ret;
    }

    /*
     * If preempt is enabled, re-establish the preempt channel. Note that
     * we do it after resume prepare to make sure the main channel will be
     * created before the preempt channel. E.g. on a weak network, the
     * destination QEMU may otherwise see the preempt and main channels
     * connect in the wrong order. This guarantees the correct order.
     */
    ret = postcopy_preempt_establish_channel(s);
    if (ret) {
        error_report("%s: postcopy_preempt_establish_channel(): %d",
                     __func__, ret);
        return ret;
    }

    /*
     * Last handshake with destination on the resume (destination will
     * switch to postcopy-active afterwards)
     */
    ret = postcopy_resume_handshake(s);
    if (ret) {
        error_report("%s: handshake failed: %d", __func__, ret);
        return ret;
    }

    return 0;
}
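/*
 * For context, a paused postcopy migration is resumed from the management
 * layer roughly like this (QMP sketch; the host/port values are
 * placeholders, and both sides must be in the "postcopy-paused" state
 * first):
 *
 *     on the destination:
 *     -> { "execute": "migrate-recover",
 *          "arguments": { "uri": "tcp:0:4444" } }
 *
 *     on the source:
 *     -> { "execute": "migrate",
 *          "arguments": { "uri": "tcp:dst-host:4444", "resume": true } }
 *
 * The new connection wakes postcopy_pause() below, which then runs
 * postcopy_do_resume() above.
 */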
/*
 * We don't return until we are in a safe state to continue the current
 * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or
 * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
 */
static MigThrError postcopy_pause(MigrationState *s)
{
    assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);

    while (true) {
        QEMUFile *file;

        /*
         * The current channel is possibly broken. Release it. Note that
         * this is guaranteed even without the lock because to_dst_file
         * should only be modified by the migration thread. That also
         * guarantees that the unregister of yank is safe too without the
         * lock. It should be safe even to be within the qemu_file_lock,
         * but we didn't do that to avoid taking more mutexes (yank_lock)
         * within qemu_file_lock. TL;DR: we make the qemu_file_lock
         * critical section as small as possible.
         */
        assert(s->to_dst_file);
        migration_ioc_unregister_yank_from_file(s->to_dst_file);
        qemu_mutex_lock(&s->qemu_file_lock);
        file = s->to_dst_file;
        s->to_dst_file = NULL;
        qemu_mutex_unlock(&s->qemu_file_lock);

        qemu_file_shutdown(file);
        qemu_fclose(file);

        migrate_set_state(&s->state, s->state,
                          MIGRATION_STATUS_POSTCOPY_PAUSED);

        error_report("Detected IO failure for postcopy. "
                     "Migration paused.");

        /*
         * We wait until things are fixed up. Then someone will set the
         * status back for us.
         */
        while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
            qemu_sem_wait(&s->postcopy_pause_sem);
        }

        if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
            /* Woken up by a recover procedure. Give it a shot */

            /*
             * Firstly, let's wake up the return path now, with a new
             * return path channel.
             */
            qemu_sem_post(&s->postcopy_pause_rp_sem);

            /* Do the resume logic */
            if (postcopy_do_resume(s) == 0) {
                /* Let's continue! */
                trace_postcopy_pause_continued();
                return MIG_THR_ERR_RECOVERED;
            } else {
                /*
                 * Something went wrong during the recovery, let's
                 * pause again. Pausing is always better than throwing
                 * data away.
                 */
                continue;
            }
        } else {
            /* This is not right... Time to quit. */
            return MIG_THR_ERR_FATAL;
        }
    }
}

static MigThrError migration_detect_error(MigrationState *s)
{
    int ret;
    int state = s->state;
    Error *local_error = NULL;

    if (state == MIGRATION_STATUS_CANCELLING ||
        state == MIGRATION_STATUS_CANCELLED) {
        /* End the migration, but don't set the state to failed */
        return MIG_THR_ERR_FATAL;
    }

    /*
     * Try to detect any file errors. Note that postcopy_qemufile_src will
     * be NULL when postcopy preempt is not enabled.
     */
    ret = qemu_file_get_error_obj_any(s->to_dst_file,
                                      s->postcopy_qemufile_src,
                                      &local_error);
    if (!ret) {
        /* Everything is fine */
        assert(!local_error);
        return MIG_THR_ERR_NONE;
    }

    if (local_error) {
        migrate_set_error(s, local_error);
        error_free(local_error);
    }

    if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
        /*
         * For postcopy, we allow the network to be down for a
         * while. After that, it can be continued by a
         * recovery phase.
         */
        return postcopy_pause(s);
    } else {
        /*
         * For precopy (or postcopy with an error outside IO), we fail
         * immediately.
         */
        migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
        trace_migration_thread_file_err();

        /* Time to stop the migration, now. */
        return MIG_THR_ERR_FATAL;
    }
}

/* How many bytes have we transferred since the beginning of the migration */
static uint64_t migration_total_bytes(MigrationState *s)
{
    return qemu_file_total_transferred(s->to_dst_file) +
        stat64_get(&mig_stats.multifd_bytes);
}
static void migration_calculate_complete(MigrationState *s)
{
    uint64_t bytes = migration_total_bytes(s);
    int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    int64_t transfer_time;

    s->total_time = end_time - s->start_time;
    if (!s->downtime) {
        /*
         * It's still not set, so this is a precopy migration. For
         * postcopy, downtime is calculated during postcopy_start().
         */
        s->downtime = end_time - s->downtime_start;
    }

    transfer_time = s->total_time - s->setup_time;
    if (transfer_time) {
        s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
    }
}

static void update_iteration_initial_status(MigrationState *s)
{
    /*
     * Update these three fields at the same time to avoid mismatched data
     * leading to wrong speed calculations.
     */
    s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    s->iteration_initial_bytes = migration_total_bytes(s);
    s->iteration_initial_pages = ram_get_total_transferred_pages();
}

static void migration_update_counters(MigrationState *s,
                                      int64_t current_time)
{
    uint64_t transferred, transferred_pages, time_spent;
    uint64_t current_bytes; /* bytes transferred since the beginning */
    double bandwidth;

    if (current_time < s->iteration_start_time + BUFFER_DELAY) {
        return;
    }

    current_bytes = migration_total_bytes(s);
    transferred = current_bytes - s->iteration_initial_bytes;
    time_spent = current_time - s->iteration_start_time;
    bandwidth = (double)transferred / time_spent;
    s->threshold_size = bandwidth * migrate_downtime_limit();

    s->mbps = (((double) transferred * 8.0) /
               ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;

    transferred_pages = ram_get_total_transferred_pages() -
                        s->iteration_initial_pages;
    s->pages_per_second = (double) transferred_pages /
                          (((double) time_spent / 1000.0));

    /*
     * If we haven't sent anything, we don't want to
     * recalculate. 10000 is a small enough number for our purposes.
     */
    if (stat64_get(&mig_stats.dirty_pages_rate) &&
        transferred > 10000) {
        s->expected_downtime =
            stat64_get(&mig_stats.dirty_bytes_last_sync) / bandwidth;
    }

    qemu_file_reset_rate_limit(s->to_dst_file);

    update_iteration_initial_status(s);

    trace_migrate_transferred(transferred, time_spent,
                              bandwidth, s->threshold_size);
}

/* Migration thread iteration status */
typedef enum {
    MIG_ITERATE_RESUME, /* Resume current iteration */
    MIG_ITERATE_SKIP,   /* Skip current iteration */
    MIG_ITERATE_BREAK,  /* Break the loop */
} MigIterateState;
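/*
 * A worked example of the arithmetic in migration_update_counters() above
 * (the numbers are made up for illustration):
 *
 *     transferred = 1 GiB = 1073741824 bytes
 *     time_spent  = 10 s  = 10000 ms
 *
 *     bandwidth      = 1073741824 / 10000 ~= 107374 bytes/ms
 *     mbps           = (1073741824 * 8 / 10) / 1e6 ~= 859 Mbps
 *     threshold_size = 107374 * downtime-limit(ms); with the default
 *                      300 ms limit that is ~32 MB - the amount of
 *                      remaining precopy data below which switchover
 *                      is attempted.
 */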
/*
 * Run one iteration of the migration, and decide whether to resume,
 * skip to the next iteration, or break out of the loop.
 */
static MigIterateState migration_iteration_run(MigrationState *s)
{
    uint64_t must_precopy, can_postcopy;
    bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;

    qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
    uint64_t pending_size = must_precopy + can_postcopy;

    trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);

    if (must_precopy <= s->threshold_size) {
        qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
        pending_size = must_precopy + can_postcopy;
        trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
    }

    if (!pending_size || pending_size < s->threshold_size) {
        trace_migration_thread_low_pending(pending_size);
        migration_completion(s);
        return MIG_ITERATE_BREAK;
    }

    /* Still a significant amount to transfer */
    if (!in_postcopy && must_precopy <= s->threshold_size &&
        qatomic_read(&s->start_postcopy)) {
        if (postcopy_start(s)) {
            error_report("%s: postcopy failed to start", __func__);
        }
        return MIG_ITERATE_SKIP;
    }

    /* Just another iteration step */
    qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
    return MIG_ITERATE_RESUME;
}

static void migration_iteration_finish(MigrationState *s)
{
    /* If we enabled cpu throttling for auto-converge, turn it off. */
    cpu_throttle_stop();

    qemu_mutex_lock_iothread();
    switch (s->state) {
    case MIGRATION_STATUS_COMPLETED:
        migration_calculate_complete(s);
        runstate_set(RUN_STATE_POSTMIGRATE);
        break;
    case MIGRATION_STATUS_COLO:
        if (!migrate_colo()) {
            error_report("%s: critical error: calling COLO code without "
                         "COLO enabled", __func__);
        }
        migrate_start_colo_process(s);
        s->vm_was_running = true;
        /* Fallthrough */
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_CANCELLING:
        if (s->vm_was_running) {
            if (!runstate_check(RUN_STATE_SHUTDOWN)) {
                vm_start();
            }
        } else {
            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
                runstate_set(RUN_STATE_POSTMIGRATE);
            }
        }
        break;

    default:
        /* Should not reach here, but if so, forgive the VM. */
        error_report("%s: Unknown ending state %d", __func__, s->state);
        break;
    }
    migrate_fd_cleanup_schedule(s);
    qemu_mutex_unlock_iothread();
}

static void bg_migration_iteration_finish(MigrationState *s)
{
    qemu_mutex_lock_iothread();
    switch (s->state) {
    case MIGRATION_STATUS_COMPLETED:
        migration_calculate_complete(s);
        break;

    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_CANCELLING:
        break;

    default:
        /* Should not reach here, but if so, forgive the VM. */
        error_report("%s: Unknown ending state %d", __func__, s->state);
        break;
    }

    migrate_fd_cleanup_schedule(s);
    qemu_mutex_unlock_iothread();
}
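/*
 * For reference, the s->start_postcopy flag checked in
 * migration_iteration_run() above is set from the QMP side; a management
 * application typically switches a precopy migration over to postcopy
 * like this (sketch; the URI is a placeholder):
 *
 *     -> { "execute": "migrate-set-capabilities", "arguments": {
 *            "capabilities": [
 *              { "capability": "postcopy-ram", "state": true } ] } }
 *     -> { "execute": "migrate", "arguments": { "uri": "tcp:..." } }
 *     ... once the remaining precopy data is small enough ...
 *     -> { "execute": "migrate-start-postcopy" }
 */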
/*
 * Run one iteration of the background snapshot, and decide whether to
 * resume or break out of the loop.
 */
static MigIterateState bg_migration_iteration_run(MigrationState *s)
{
    int res;

    res = qemu_savevm_state_iterate(s->to_dst_file, false);
    if (res > 0) {
        bg_migration_completion(s);
        return MIG_ITERATE_BREAK;
    }

    return MIG_ITERATE_RESUME;
}

void migration_make_urgent_request(void)
{
    qemu_sem_post(&migrate_get_current()->rate_limit_sem);
}

void migration_consume_urgent_request(void)
{
    qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
}

/* Returns true if the rate limiting was broken by an urgent request */
bool migration_rate_limit(void)
{
    int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    MigrationState *s = migrate_get_current();

    bool urgent = false;
    migration_update_counters(s, now);
    if (qemu_file_rate_limit(s->to_dst_file)) {

        if (qemu_file_get_error(s->to_dst_file)) {
            return false;
        }
        /*
         * Wait for a delay to do rate limiting OR
         * something urgent to post the semaphore.
         */
        int ms = s->iteration_start_time + BUFFER_DELAY - now;
        trace_migration_rate_limit_pre(ms);
        if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
            /*
             * We were woken by one or more urgent things but
             * the timedwait will have consumed one of them.
             * The service routine for the urgent wake will dec
             * the semaphore itself for each item it consumes,
             * so add back the one we just ate.
             */
            qemu_sem_post(&s->rate_limit_sem);
            urgent = true;
        }
        trace_migration_rate_limit_post(urgent);
    }
    return urgent;
}

/*
 * If failover devices are present, wait until they are completely
 * unplugged
 */
static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
                                    int new_state)
{
    if (qemu_savevm_state_guest_unplug_pending()) {
        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);

        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
               qemu_savevm_state_guest_unplug_pending()) {
            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
        }
        if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
            int timeout = 120; /* 30 seconds */
            /*
             * The migration has been canceled, but as we have started an
             * unplug we must wait for it to finish to be able to plug the
             * card back in.
             */
            while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
                qemu_sem_timedwait(&s->wait_unplug_sem, 250);
            }
            if (qemu_savevm_state_guest_unplug_pending() &&
                !qtest_enabled()) {
                warn_report("migration: partially unplugged device on "
                            "failure");
            }
        }

        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
    } else {
        migrate_set_state(&s->state, old_state, new_state);
    }
}
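/*
 * A sketch of how the urgent-request pair above is meant to be used by a
 * producer that must get data onto the wire without waiting out the rate
 * limit (the function names are real, the scenario is illustrative):
 *
 *     // producer side (e.g. a path servicing postcopy page requests):
 *     migration_make_urgent_request();      // posts rate_limit_sem
 *     ... queue the urgent data ...
 *     migration_consume_urgent_request();   // consumed once serviced
 *
 * migration_rate_limit() in the migration thread wakes early when the
 * semaphore is posted, re-posts the count it consumed in the timedwait,
 * and reports the wake as urgent to its caller.
 */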
/*
 * Master migration thread on the source VM.
 * It drives the migration and pumps the data down the outgoing channel.
 */
static void *migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    MigrationThread *thread = NULL;
    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    MigThrError thr_error;
    bool urgent = false;

    thread = MigrationThreadAdd("live_migration", qemu_get_thread_id());

    rcu_register_thread();

    object_ref(OBJECT(s));
    update_iteration_initial_status(s);

    qemu_savevm_state_header(s->to_dst_file);

    /*
     * If we opened the return path, we need to make sure the destination
     * has it opened as well.
     */
    if (s->rp_state.rp_thread_created) {
        /* Now tell the dest that it should open its end so it can reply */
        qemu_savevm_send_open_return_path(s->to_dst_file);

        /* And do a ping that will make stuff easier to debug */
        qemu_savevm_send_ping(s->to_dst_file, 1);
    }

    if (migrate_postcopy()) {
        /*
         * Tell the destination that we *might* want to do postcopy later;
         * if the other end can't do postcopy it should fail now, nice and
         * early.
         */
        qemu_savevm_send_postcopy_advise(s->to_dst_file);
    }

    if (migrate_colo()) {
        /* Notify migration destination that we enable COLO */
        qemu_savevm_send_colo_enable(s->to_dst_file);
    }

    qemu_savevm_state_setup(s->to_dst_file);

    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
                            MIGRATION_STATUS_ACTIVE);

    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;

    trace_migration_thread_setup_complete();

    while (migration_is_active(s)) {
        if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
            MigIterateState iter_state = migration_iteration_run(s);
            if (iter_state == MIG_ITERATE_SKIP) {
                continue;
            } else if (iter_state == MIG_ITERATE_BREAK) {
                break;
            }
        }

        /*
         * Try to detect any kind of failures, and see whether we
         * should stop the migration now.
         */
        thr_error = migration_detect_error(s);
        if (thr_error == MIG_THR_ERR_FATAL) {
            /* Stop migration */
            break;
        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
            /*
             * We just recovered from, e.g., a network failure; reset all
             * the local variables. This is important to avoid breaking
             * the transferred_bytes and bandwidth calculations.
             */
            update_iteration_initial_status(s);
        }

        urgent = migration_rate_limit();
    }

    trace_migration_thread_after_loop();
    migration_iteration_finish(s);
    object_unref(OBJECT(s));
    rcu_unregister_thread();
    MigrationThreadDel(thread);
    return NULL;
}

static void bg_migration_vm_start_bh(void *opaque)
{
    MigrationState *s = opaque;

    qemu_bh_delete(s->vm_start_bh);
    s->vm_start_bh = NULL;

    vm_start();
    s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
}
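/*
 * The bottom-half above follows the usual one-shot BH pattern: allocate,
 * schedule, and delete from inside the handler. A minimal sketch of that
 * lifecycle ("cb" and "state" are placeholder names):
 *
 *     static void cb(void *opaque)
 *     {
 *         State *state = opaque;
 *         qemu_bh_delete(state->bh);   // one-shot: free it first
 *         state->bh = NULL;
 *         ... do the main-loop-context work ...
 *     }
 *
 *     state->bh = qemu_bh_new(cb, state);
 *     qemu_bh_schedule(state->bh);     // cb runs in the main loop
 */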
/**
 * Background snapshot thread, based on live migration code.
 * This is an alternative implementation of the live migration mechanism,
 * introduced specifically to support background snapshots.
 *
 * It takes advantage of the userfault_fd write protection mechanism
 * introduced in the v5.7 kernel. Compared to existing dirty page logging
 * migration, much less stream traffic is produced, resulting in smaller
 * snapshot images, simply because no duplicate pages can get into the
 * stream.
 *
 * Another key point is that the generated vmstate stream reflects the
 * machine state 'frozen' at the beginning of snapshot creation, whereas
 * with the dirty page logging mechanism the saved snapshot is effectively
 * the state of the VM at the end of the process.
 */
static void *bg_migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    int64_t setup_start;
    MigThrError thr_error;
    QEMUFile *fb;
    bool early_fail = true;

    rcu_register_thread();
    object_ref(OBJECT(s));

    qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);

    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    /*
     * We want to save the vmstate for the moment when migration was
     * initiated, but we also want to save RAM content while the VM is
     * running. The RAM content should appear first in the vmstate. So, we
     * first stash the non-RAM part of the vmstate to the temporary buffer,
     * then write the RAM part of the vmstate to the migration stream
     * with vCPUs running and, finally, write the stashed non-RAM part of
     * the vmstate from the buffer to the migration stream.
     */
    s->bioc = qio_channel_buffer_new(512 * 1024);
    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
    fb = qemu_file_new_output(QIO_CHANNEL(s->bioc));
    object_unref(OBJECT(s->bioc));

    update_iteration_initial_status(s);

    /*
     * Prepare for tracking memory writes with UFFD-WP - populate
     * RAM pages before protecting.
     */
#ifdef __linux__
    ram_write_tracking_prepare();
#endif

    qemu_savevm_state_header(s->to_dst_file);
    qemu_savevm_state_setup(s->to_dst_file);

    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
                            MIGRATION_STATUS_ACTIVE);

    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;

    trace_migration_thread_setup_complete();
    s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    qemu_mutex_lock_iothread();

    /*
     * If the VM is currently in a suspended state, then, to make a valid
     * runstate transition in vm_stop_force_state(), we need to wake it up.
     */
    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
    s->vm_was_running = runstate_is_running();

    if (global_state_store()) {
        goto fail;
    }
    /* Forcibly stop the VM before saving the state of vCPUs and devices */
    if (vm_stop_force_state(RUN_STATE_PAUSED)) {
        goto fail;
    }
    /*
     * Put vCPUs in sync with shadow context structures, then
     * save their state to the channel-buffer along with devices.
     */
    cpu_synchronize_all_states();
    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
        goto fail;
    }
    /*
     * Since we are going to get non-iterable state data directly
     * from s->bioc->data, an explicit flush is needed here.
     */
    qemu_fflush(fb);

    /* Now initialize the UFFD context and start tracking RAM writes */
    if (ram_write_tracking_start()) {
        goto fail;
    }
    early_fail = false;

    /*
     * Start the VM from the BH handler to avoid a write-fault lock here.
     * UFFD-WP protection for the whole of RAM is already enabled, so
     * calling VM state change notifiers from vm_start() would initiate
     * writes to virtio VQ memory which is in the write-protected region.
     */
    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
    qemu_bh_schedule(s->vm_start_bh);

    qemu_mutex_unlock_iothread();

    while (migration_is_active(s)) {
        MigIterateState iter_state = bg_migration_iteration_run(s);
        if (iter_state == MIG_ITERATE_SKIP) {
            continue;
        } else if (iter_state == MIG_ITERATE_BREAK) {
            break;
        }

        /*
         * Try to detect any kind of failures, and see whether we
         * should stop the migration now.
         */
        thr_error = migration_detect_error(s);
        if (thr_error == MIG_THR_ERR_FATAL) {
            /* Stop migration */
            break;
        }

        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
    }

    trace_migration_thread_after_loop();

fail:
    if (early_fail) {
        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_FAILED);
        qemu_mutex_unlock_iothread();
    }

    bg_migration_iteration_finish(s);

    qemu_fclose(fb);
    object_unref(OBJECT(s));
    rcu_unregister_thread();

    return NULL;
}
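/*
 * For context, the thread above is selected in migrate_fd_connect() below
 * when the background-snapshot capability is set; a typical invocation from
 * the management side looks like this (sketch; the snapshot path is a
 * placeholder):
 *
 *     -> { "execute": "migrate-set-capabilities", "arguments": {
 *            "capabilities": [
 *              { "capability": "background-snapshot", "state": true } ] } }
 *     -> { "execute": "migrate",
 *          "arguments": { "uri": "exec:cat > /path/to/snapshot" } }
 */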
void migrate_fd_connect(MigrationState *s, Error *error_in)
{
    Error *local_err = NULL;
    int64_t rate_limit;
    bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;

    /*
     * If there's a previous error, free it and prepare for another one.
     * Meanwhile, if migration completes successfully, there won't be an
     * error dumped when calling migrate_fd_cleanup().
     */
    migrate_error_free(s);

    s->expected_downtime = migrate_downtime_limit();
    if (resume) {
        assert(s->cleanup_bh);
    } else {
        assert(!s->cleanup_bh);
        s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
    }
    if (error_in) {
        migrate_fd_error(s, error_in);
        if (resume) {
            /*
             * Don't do cleanup for resume if the channel is invalid, but
             * only dump the error. We wait for another channel connect
             * from the user. The error_report still gives the HMP user a
             * hint on what failed. It's normally done in
             * migrate_fd_cleanup(), but call it here explicitly.
             */
            error_report_err(error_copy(s->error));
        } else {
            migrate_fd_cleanup(s);
        }
        return;
    }

    if (resume) {
        /* This is a resumed migration */
        rate_limit = migrate_max_postcopy_bandwidth() /
            XFER_LIMIT_RATIO;
    } else {
        /* This is a fresh migration */
        rate_limit = migrate_max_bandwidth() / XFER_LIMIT_RATIO;

        /* Notify before starting migration thread */
        notifier_list_notify(&migration_state_notifiers, s);
    }

    qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
    qemu_file_set_blocking(s->to_dst_file, true);

    /*
     * Open the return path. For postcopy, it is used exclusively. For
     * precopy, QEMU uses the return path only if the user specified the
     * "return-path" capability.
     */
    if (migrate_postcopy_ram() || migrate_return_path()) {
        if (open_return_path_on_source(s, !resume)) {
            error_report("Unable to open return-path for postcopy");
            migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
            migrate_fd_cleanup(s);
            return;
        }
    }

    /*
     * This needs to be done before resuming a postcopy. Note: for newer
     * QEMUs we will delay the channel creation until postcopy_start(), to
     * avoid disorder of channel creations.
     */
    if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
        postcopy_preempt_setup(s);
    }

    if (resume) {
        /* Wake up the main migration thread to do the recovery */
        migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);
        qemu_sem_post(&s->postcopy_pause_sem);
        return;
    }

    if (multifd_save_setup(&local_err) != 0) {
        error_report_err(local_err);
        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                          MIGRATION_STATUS_FAILED);
        migrate_fd_cleanup(s);
        return;
    }

    if (migrate_background_snapshot()) {
        qemu_thread_create(&s->thread, "bg_snapshot",
                           bg_migration_thread, s, QEMU_THREAD_JOINABLE);
    } else {
        qemu_thread_create(&s->thread, "live_migration",
                           migration_thread, s, QEMU_THREAD_JOINABLE);
    }
    s->migration_thread_running = true;
}

static void migration_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);

    dc->user_creatable = false;
    device_class_set_props(dc, migration_properties);
}

static void migration_instance_finalize(Object *obj)
{
    MigrationState *ms = MIGRATION_OBJ(obj);

    qemu_mutex_destroy(&ms->error_mutex);
    qemu_mutex_destroy(&ms->qemu_file_lock);
    qemu_sem_destroy(&ms->wait_unplug_sem);
    qemu_sem_destroy(&ms->rate_limit_sem);
    qemu_sem_destroy(&ms->pause_sem);
    qemu_sem_destroy(&ms->postcopy_pause_sem);
    qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
    qemu_sem_destroy(&ms->rp_state.rp_sem);
    qemu_sem_destroy(&ms->rp_state.rp_pong_acks);
    qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
    error_free(ms->error);
}

static void migration_instance_init(Object *obj)
{
    MigrationState *ms = MIGRATION_OBJ(obj);

    ms->state = MIGRATION_STATUS_NONE;
    ms->mbps = -1;
    ms->pages_per_second = -1;
    qemu_sem_init(&ms->pause_sem, 0);
    qemu_mutex_init(&ms->error_mutex);

    migrate_params_init(&ms->parameters);

    qemu_sem_init(&ms->postcopy_pause_sem, 0);
    qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
    qemu_sem_init(&ms->rp_state.rp_sem, 0);
    qemu_sem_init(&ms->rp_state.rp_pong_acks, 0);
    qemu_sem_init(&ms->rate_limit_sem, 0);
    qemu_sem_init(&ms->wait_unplug_sem, 0);
    qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
    qemu_mutex_init(&ms->qemu_file_lock);
}

/*
 * Return true if the check passes, false otherwise. The error will be
 * put inside errp if provided.
 */
static bool migration_object_check(MigrationState *ms, Error **errp)
{
    /* Assuming all off */
    bool old_caps[MIGRATION_CAPABILITY__MAX] = { 0 };

    if (!migrate_params_check(&ms->parameters, errp)) {
        return false;
    }

    return migrate_caps_check(old_caps, ms->capabilities, errp);
}
static const TypeInfo migration_type = {
    .name = TYPE_MIGRATION,
    /*
     * NOTE: TYPE_MIGRATION is not really a device, as the object is
     * not created using qdev_new(), it is not attached to the qdev
     * device tree, and it is never realized.
     *
     * TODO: Make this TYPE_OBJECT once QOM provides something like
     * TYPE_DEVICE's "-global" properties.
     */
    .parent = TYPE_DEVICE,
    .class_init = migration_class_init,
    .class_size = sizeof(MigrationClass),
    .instance_size = sizeof(MigrationState),
    .instance_init = migration_instance_init,
    .instance_finalize = migration_instance_finalize,
};

static void register_migration_types(void)
{
    type_register_static(&migration_type);
}

type_init(register_migration_types);