1 /* 2 * QEMU live migration 3 * 4 * Copyright IBM, Corp. 2008 5 * 6 * Authors: 7 * Anthony Liguori <aliguori@us.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Contributions after 2012-01-13 are licensed under the terms of the 13 * GNU GPL, version 2 or (at your option) any later version. 14 */ 15 16 #include "qemu/osdep.h" 17 #include "qemu/cutils.h" 18 #include "qemu/error-report.h" 19 #include "qemu/main-loop.h" 20 #include "migration/blocker.h" 21 #include "exec.h" 22 #include "fd.h" 23 #include "file.h" 24 #include "socket.h" 25 #include "sysemu/runstate.h" 26 #include "sysemu/sysemu.h" 27 #include "sysemu/cpu-throttle.h" 28 #include "rdma.h" 29 #include "ram.h" 30 #include "ram-compress.h" 31 #include "migration/global_state.h" 32 #include "migration/misc.h" 33 #include "migration.h" 34 #include "migration-stats.h" 35 #include "savevm.h" 36 #include "qemu-file.h" 37 #include "channel.h" 38 #include "migration/vmstate.h" 39 #include "block/block.h" 40 #include "qapi/error.h" 41 #include "qapi/clone-visitor.h" 42 #include "qapi/qapi-visit-migration.h" 43 #include "qapi/qapi-visit-sockets.h" 44 #include "qapi/qapi-commands-migration.h" 45 #include "qapi/qapi-events-migration.h" 46 #include "qapi/qmp/qerror.h" 47 #include "qapi/qmp/qnull.h" 48 #include "qemu/rcu.h" 49 #include "block.h" 50 #include "postcopy-ram.h" 51 #include "qemu/thread.h" 52 #include "trace.h" 53 #include "exec/target_page.h" 54 #include "io/channel-buffer.h" 55 #include "io/channel-tls.h" 56 #include "migration/colo.h" 57 #include "hw/boards.h" 58 #include "monitor/monitor.h" 59 #include "net/announce.h" 60 #include "qemu/queue.h" 61 #include "multifd.h" 62 #include "threadinfo.h" 63 #include "qemu/yank.h" 64 #include "sysemu/cpus.h" 65 #include "yank_functions.h" 66 #include "sysemu/qtest.h" 67 #include "options.h" 68 #include "sysemu/dirtylimit.h" 69 #include "qemu/sockets.h" 70 71 static 
NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

/* Messages sent on the return path from destination to source */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32 ) */

    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
    MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */

    MIG_RP_MSG_MAX
};

/* When we add fault tolerance, we could have several
   migrations at once.  For now we don't need to add
   dynamic creation of migration */

/* Singleton objects for the outgoing and incoming sides, created once in
 * migration_object_init(). */
static MigrationState *current_migration;
static MigrationIncomingState *current_incoming;

/* Per-mode lists of Error* blockers registered against migration */
static GSList *migration_blockers[MIG_MODE__MAX];

static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state);
static void migrate_fd_cancel(MigrationState *s);
static bool close_return_path_on_source(MigrationState *s);

/* Record the wall-clock time at which switchover downtime begins. */
static void migration_downtime_start(MigrationState *s)
{
    trace_vmstate_downtime_checkpoint("src-downtime-start");
    s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
}

/* Finalize s->downtime from s->downtime_start, unless already set. */
static void migration_downtime_end(MigrationState *s)
{
    int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /*
     * If downtime already set, should mean that postcopy already set it,
     * then that should be the real downtime already.
     */
    if (!s->downtime) {
        s->downtime = now - s->downtime_start;
    }

    trace_vmstate_downtime_checkpoint("src-downtime-end");
}

/* True when the configured features need more than one migration channel. */
static bool migration_needs_multiple_sockets(void)
{
    return migrate_multifd() || migrate_postcopy_preempt();
}

/* Only inet/unix/vsock sockets can carry multiple channels. */
static bool transport_supports_multi_channels(MigrationAddress *addr)
{
    if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
        SocketAddress *saddr = &addr->u.socket;

        return saddr->type == SOCKET_ADDRESS_TYPE_INET ||
               saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
               saddr->type == SOCKET_ADDRESS_TYPE_VSOCK;
    }

    return false;
}

static bool
migration_channels_and_transport_compatible(MigrationAddress *addr,
                                            Error **errp)
{
    if (migration_needs_multiple_sockets() &&
        !transport_supports_multi_channels(addr)) {
        error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)");
        return false;
    }

    return true;
}

/* GCompareFunc ordering host addresses for the page_requested tree. */
static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
{
    uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;

    return (a > b) - (a < b);
}

/* Stop the VM for switchover, recording a downtime trace checkpoint. */
int migration_stop_vm(RunState state)
{
    int ret = vm_stop_force_state(state);

    trace_vmstate_downtime_checkpoint("src-vm-stopped");

    return ret;
}

void migration_object_init(void)
{
    /* This can only be called once. */
    assert(!current_migration);
    current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));

    /*
     * Init the migrate incoming object as well no matter whether
     * we'll use it or not.
     */
    assert(!current_incoming);
    current_incoming = g_new0(MigrationIncomingState, 1);
    current_incoming->state = MIGRATION_STATUS_NONE;
    current_incoming->postcopy_remote_fds =
        g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
    qemu_mutex_init(&current_incoming->rp_mutex);
    qemu_mutex_init(&current_incoming->postcopy_prio_thread_mutex);
    qemu_event_init(&current_incoming->main_thread_load_event, false);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
    qemu_sem_init(&current_incoming->postcopy_qemufile_dst_done, 0);

    qemu_mutex_init(&current_incoming->page_request_mutex);
    qemu_cond_init(&current_incoming->page_request_cond);
    current_incoming->page_requested = g_tree_new(page_request_addr_cmp);

    /* &error_fatal: an invalid initial configuration aborts startup */
    migration_object_check(current_migration, &error_fatal);

    blk_mig_init();
    ram_mig_init();
    dirty_bitmap_mig_init();
}

/* Cancel the current outgoing migration; 'error' (optional) records why. */
void migration_cancel(const Error *error)
{
    if (error) {
        migrate_set_error(current_migration, error);
    }
    if (migrate_dirty_limit()) {
        qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
    }
    migrate_fd_cancel(current_migration);
}

void migration_shutdown(void)
{
    /*
     * When the QEMU main thread exit, the COLO thread
     * may wait a semaphore. So, we should wakeup the
     * COLO thread before migration shutdown.
     */
    colo_shutdown();
    /*
     * Cancel the current migration - that will (eventually)
     * stop the migration using this structure
     */
    migration_cancel(NULL);
    object_unref(OBJECT(current_migration));

    /*
     * Cancel outgoing migration of dirty bitmaps. It should
     * at least unref used block nodes.
     */
    dirty_bitmap_mig_cancel_outgoing();

    /*
     * Cancel incoming migration of dirty bitmaps.
Dirty bitmaps
     * are non-critical data, and their loss never considered as
     * something serious.
     */
    dirty_bitmap_mig_cancel_incoming();
}

/* For outgoing */
MigrationState *migrate_get_current(void)
{
    /* This can only be called after the object created. */
    assert(current_migration);
    return current_migration;
}

MigrationIncomingState *migration_incoming_get_current(void)
{
    assert(current_incoming);
    return current_incoming;
}

/* Release the incoming transport resources (listener address list and any
 * transport-private data registered by the channel backend). */
void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
{
    if (mis->socket_address_list) {
        qapi_free_SocketAddressList(mis->socket_address_list);
        mis->socket_address_list = NULL;
    }

    if (mis->transport_cleanup) {
        mis->transport_cleanup(mis->transport_data);
        mis->transport_data = mis->transport_cleanup = NULL;
    }
}

/* Tear down the incoming migration state: notify the source, close all
 * files/channels, and free per-migration data structures.  Each pointer is
 * NULLed after release so the function is safe against partial setup. */
void migration_incoming_state_destroy(void)
{
    struct MigrationIncomingState *mis = migration_incoming_get_current();

    multifd_load_cleanup();
    compress_threads_load_cleanup();

    if (mis->to_src_file) {
        /* Tell source that we are done */
        migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
        qemu_fclose(mis->to_src_file);
        mis->to_src_file = NULL;
    }

    if (mis->from_src_file) {
        migration_ioc_unregister_yank_from_file(mis->from_src_file);
        qemu_fclose(mis->from_src_file);
        mis->from_src_file = NULL;
    }
    if (mis->postcopy_remote_fds) {
        g_array_free(mis->postcopy_remote_fds, TRUE);
        mis->postcopy_remote_fds = NULL;
    }

    migration_incoming_transport_cleanup(mis);
    qemu_event_reset(&mis->main_thread_load_event);

    if (mis->page_requested) {
        g_tree_destroy(mis->page_requested);
        mis->page_requested = NULL;
    }

    if (mis->postcopy_qemufile_dst) {
        migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
        qemu_fclose(mis->postcopy_qemufile_dst);
mis->postcopy_qemufile_dst = NULL; 310 } 311 312 yank_unregister_instance(MIGRATION_YANK_INSTANCE); 313 } 314 315 static void migrate_generate_event(int new_state) 316 { 317 if (migrate_events()) { 318 qapi_event_send_migration(new_state); 319 } 320 } 321 322 /* 323 * Send a message on the return channel back to the source 324 * of the migration. 325 */ 326 static int migrate_send_rp_message(MigrationIncomingState *mis, 327 enum mig_rp_message_type message_type, 328 uint16_t len, void *data) 329 { 330 int ret = 0; 331 332 trace_migrate_send_rp_message((int)message_type, len); 333 QEMU_LOCK_GUARD(&mis->rp_mutex); 334 335 /* 336 * It's possible that the file handle got lost due to network 337 * failures. 338 */ 339 if (!mis->to_src_file) { 340 ret = -EIO; 341 return ret; 342 } 343 344 qemu_put_be16(mis->to_src_file, (unsigned int)message_type); 345 qemu_put_be16(mis->to_src_file, len); 346 qemu_put_buffer(mis->to_src_file, data, len); 347 return qemu_fflush(mis->to_src_file); 348 } 349 350 /* Request one page from the source VM at the given start address. 351 * rb: the RAMBlock to request the page in 352 * Start: Address offset within the RB 353 * Len: Length in bytes required - must be a multiple of pagesize 354 */ 355 int migrate_send_rp_message_req_pages(MigrationIncomingState *mis, 356 RAMBlock *rb, ram_addr_t start) 357 { 358 uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */ 359 size_t msglen = 12; /* start + len */ 360 size_t len = qemu_ram_pagesize(rb); 361 enum mig_rp_message_type msg_type; 362 const char *rbname; 363 int rbname_len; 364 365 *(uint64_t *)bufc = cpu_to_be64((uint64_t)start); 366 *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len); 367 368 /* 369 * We maintain the last ramblock that we requested for page. Note that we 370 * don't need locking because this function will only be called within the 371 * postcopy ram fault thread. 
     */
    if (rb != mis->last_rb) {
        mis->last_rb = rb;

        rbname = qemu_ram_get_idstr(rb);
        rbname_len = strlen(rbname);

        /* the length is stored in a single byte below */
        assert(rbname_len < 256);

        bufc[msglen++] = rbname_len;
        memcpy(bufc + msglen, rbname, rbname_len);
        msglen += rbname_len;
        msg_type = MIG_RP_MSG_REQ_PAGES_ID;
    } else {
        msg_type = MIG_RP_MSG_REQ_PAGES;
    }

    return migrate_send_rp_message(mis, msg_type, msglen, bufc);
}

/* Ask the source for the page backing host address 'haddr' (offset 'start'
 * in 'rb'), unless it already arrived or was already queued. */
int migrate_send_rp_req_pages(MigrationIncomingState *mis,
                              RAMBlock *rb, ram_addr_t start, uint64_t haddr)
{
    void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
    bool received = false;

    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
        received = ramblock_recv_bitmap_test_byte_offset(rb, start);
        if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
            /*
             * The page has not been received, and it's not yet in the page
             * request list. Queue it. Set the value of element to 1, so that
             * things like g_tree_lookup() will return TRUE (1) when found.
             */
            g_tree_insert(mis->page_requested, aligned, (gpointer)1);
            qatomic_inc(&mis->page_requested_count);
            trace_postcopy_page_req_add(aligned, mis->page_requested_count);
        }
    }

    /*
     * If the page is there, skip sending the message. We don't even need the
     * lock because as long as the page arrived, it'll be there forever.
415 */ 416 if (received) { 417 return 0; 418 } 419 420 return migrate_send_rp_message_req_pages(mis, rb, start); 421 } 422 423 static bool migration_colo_enabled; 424 bool migration_incoming_colo_enabled(void) 425 { 426 return migration_colo_enabled; 427 } 428 429 void migration_incoming_disable_colo(void) 430 { 431 ram_block_discard_disable(false); 432 migration_colo_enabled = false; 433 } 434 435 int migration_incoming_enable_colo(void) 436 { 437 #ifndef CONFIG_REPLICATION 438 error_report("ENABLE_COLO command come in migration stream, but COLO " 439 "module is not built in"); 440 return -ENOTSUP; 441 #endif 442 443 if (!migrate_colo()) { 444 error_report("ENABLE_COLO command come in migration stream, but c-colo " 445 "capability is not set"); 446 return -EINVAL; 447 } 448 449 if (ram_block_discard_disable(true)) { 450 error_report("COLO: cannot disable RAM discard"); 451 return -EBUSY; 452 } 453 migration_colo_enabled = true; 454 return 0; 455 } 456 457 void migrate_add_address(SocketAddress *address) 458 { 459 MigrationIncomingState *mis = migration_incoming_get_current(); 460 461 QAPI_LIST_PREPEND(mis->socket_address_list, 462 QAPI_CLONE(SocketAddress, address)); 463 } 464 465 bool migrate_uri_parse(const char *uri, MigrationChannel **channel, 466 Error **errp) 467 { 468 g_autoptr(MigrationChannel) val = g_new0(MigrationChannel, 1); 469 g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1); 470 InetSocketAddress *isock = &addr->u.rdma; 471 strList **tail = &addr->u.exec.args; 472 473 if (strstart(uri, "exec:", NULL)) { 474 addr->transport = MIGRATION_ADDRESS_TYPE_EXEC; 475 #ifdef WIN32 476 QAPI_LIST_APPEND(tail, g_strdup(exec_get_cmd_path())); 477 QAPI_LIST_APPEND(tail, g_strdup("/c")); 478 #else 479 QAPI_LIST_APPEND(tail, g_strdup("/bin/sh")); 480 QAPI_LIST_APPEND(tail, g_strdup("-c")); 481 #endif 482 QAPI_LIST_APPEND(tail, g_strdup(uri + strlen("exec:"))); 483 } else if (strstart(uri, "rdma:", NULL)) { 484 if (inet_parse(isock, uri + 
strlen("rdma:"), errp)) { 485 qapi_free_InetSocketAddress(isock); 486 return false; 487 } 488 addr->transport = MIGRATION_ADDRESS_TYPE_RDMA; 489 } else if (strstart(uri, "tcp:", NULL) || 490 strstart(uri, "unix:", NULL) || 491 strstart(uri, "vsock:", NULL) || 492 strstart(uri, "fd:", NULL)) { 493 addr->transport = MIGRATION_ADDRESS_TYPE_SOCKET; 494 SocketAddress *saddr = socket_parse(uri, errp); 495 if (!saddr) { 496 return false; 497 } 498 addr->u.socket.type = saddr->type; 499 addr->u.socket.u = saddr->u; 500 /* Don't free the objects inside; their ownership moved to "addr" */ 501 g_free(saddr); 502 } else if (strstart(uri, "file:", NULL)) { 503 addr->transport = MIGRATION_ADDRESS_TYPE_FILE; 504 addr->u.file.filename = g_strdup(uri + strlen("file:")); 505 if (file_parse_offset(addr->u.file.filename, &addr->u.file.offset, 506 errp)) { 507 return false; 508 } 509 } else { 510 error_setg(errp, "unknown migration protocol: %s", uri); 511 return false; 512 } 513 514 val->channel_type = MIGRATION_CHANNEL_TYPE_MAIN; 515 val->addr = g_steal_pointer(&addr); 516 *channel = g_steal_pointer(&val); 517 return true; 518 } 519 520 static void qemu_start_incoming_migration(const char *uri, bool has_channels, 521 MigrationChannelList *channels, 522 Error **errp) 523 { 524 g_autoptr(MigrationChannel) channel = NULL; 525 MigrationAddress *addr = NULL; 526 MigrationIncomingState *mis = migration_incoming_get_current(); 527 528 /* 529 * Having preliminary checks for uri and channel 530 */ 531 if (uri && has_channels) { 532 error_setg(errp, "'uri' and 'channels' arguments are mutually " 533 "exclusive; exactly one of the two should be present in " 534 "'migrate-incoming' qmp command "); 535 return; 536 } else if (channels) { 537 /* To verify that Migrate channel list has only item */ 538 if (channels->next) { 539 error_setg(errp, "Channel list has more than one entries"); 540 return; 541 } 542 addr = channels->value->addr; 543 } else if (uri) { 544 /* caller uses the old URI syntax */ 
        if (!migrate_uri_parse(uri, &channel, errp)) {
            return;
        }
        addr = channel->addr;
    } else {
        error_setg(errp, "neither 'uri' or 'channels' argument are "
                   "specified in 'migrate-incoming' qmp command ");
        return;
    }

    /* transport mechanism not suitable for migration? */
    if (!migration_channels_and_transport_compatible(addr, errp)) {
        return;
    }

    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_SETUP);

    if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
        SocketAddress *saddr = &addr->u.socket;
        if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
            saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
            saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
            socket_start_incoming_migration(saddr, errp);
        } else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
            fd_start_incoming_migration(saddr->u.fd.str, errp);
        }
#ifdef CONFIG_RDMA
    } else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
        /* RDMA is incompatible with features that rewrite the page stream */
        if (migrate_compress()) {
            error_setg(errp, "RDMA and compression can't be used together");
            return;
        }
        if (migrate_xbzrle()) {
            error_setg(errp, "RDMA and XBZRLE can't be used together");
            return;
        }
        if (migrate_multifd()) {
            error_setg(errp, "RDMA and multifd can't be used together");
            return;
        }
        rdma_start_incoming_migration(&addr->u.rdma, errp);
#endif
    } else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
        exec_start_incoming_migration(addr->u.exec.args, errp);
    } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
        file_start_incoming_migration(&addr->u.file, errp);
    } else {
        /* NOTE(review): 'uri' may be NULL on the 'channels' path here */
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}

/* Bottom half run on the main loop once precopy load has completed:
 * activate block devices, announce, and start/pause the VM as configured. */
static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    trace_vmstate_downtime_checkpoint("dst-precopy-bh-enter");

    /* If capability late_block_activate is set:
     *
     * Only fire up the block code now if we're going to restart the
     * VM, else 'cont' will do it.
     * This causes file locking to happen; so we don't want it to happen
     * unless we really are starting the VM.
     */
    if (!migrate_late_block_activate() ||
        (autostart && (!global_state_received() ||
            global_state_get_runstate() == RUN_STATE_RUNNING))) {
        /* Make sure all file formats throw away their mutable metadata.
         * If we get an error here, just don't restart the VM yet. */
        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            autostart = false;
        }
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self(&mis->announce_timer, migrate_announce_params());

    trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced");

    multifd_load_shutdown();

    dirty_bitmap_mig_before_vm_start();

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else if (migration_incoming_colo_enabled()) {
        migration_incoming_disable_colo();
        vm_start();
    } else {
        /* Mirror whatever run state the source reported */
        runstate_set(global_state_get_runstate());
    }
    trace_vmstate_downtime_checkpoint("dst-precopy-bh-vm-started");
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
    /* drop the ref taken when the BH was scheduled */
    object_unref(OBJECT(migrate_get_current()));
}

/* Coroutine driving the incoming migration: load the VM state, then either
 * hand off to the postcopy thread or schedule the completion BH.  Any
 * failure here is fatal for the destination process. */
static void coroutine_fn
process_incoming_migration_co(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyState ps;
    int ret;

    assert(mis->from_src_file);

    if (compress_threads_load_setup(mis->from_src_file)) {
        error_report("Failed to setup decompress threads");
        goto fail;
    }

    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_ACTIVE);

    mis->loadvm_co = qemu_coroutine_self();
    ret = qemu_loadvm_state(mis->from_src_file);
    mis->loadvm_co = NULL;

    trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else if something went wrong then just fall out of the normal exit */
    }

    if (ret < 0) {
        error_report("load of migration failed: %s", strerror(-ret));
        goto fail;
    }

    if (colo_incoming_co() < 0) {
        goto fail;
    }

    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    /* keep the migration object alive until the BH has run */
    object_ref(OBJECT(migrate_get_current()));
    qemu_bh_schedule(mis->bh);
    return;
fail:
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    qemu_fclose(mis->from_src_file);

    multifd_load_cleanup();
    compress_threads_load_cleanup();

    /* a failed incoming migration leaves no usable VM; terminate */
    exit(EXIT_FAILURE);
}

/**
 * migration_incoming_setup: Setup incoming migration
 * @f: file for main migration channel
 * @errp: where to put errors
 *
 * Returns: %true on success, %false on error.
 */
static bool migration_incoming_setup(QEMUFile *f, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!mis->from_src_file) {
        mis->from_src_file = f;
    }
    qemu_file_set_blocking(f, false);
    return true;
}

/* Kick off the incoming-migration coroutine on the main loop. */
void migration_incoming_process(void)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
    qemu_coroutine_enter(co);
}

/* Returns true if recovered from a paused migration, otherwise false */
static bool postcopy_try_recover(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
        /* Resumed from a paused postcopy migration */

        /* This should be set already in migration_incoming_setup() */
        assert(mis->from_src_file);
        /* Postcopy has standalone thread to do vm load */
        qemu_file_set_blocking(mis->from_src_file, true);

        /* Re-configure the return path */
        mis->to_src_file =
            qemu_file_get_return_path(mis->from_src_file);

        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);

        /*
         * Here, we only wake up the main loading thread (while the
         * rest threads will still be waiting), so that we can receive
         * commands from source now, and answer it if needed. The
         * rest threads will be woken up afterwards until we are sure
         * that source is ready to reply to page requests.
         */
        qemu_sem_post(&mis->postcopy_pause_sem_dst);
        return true;
    }

    return false;
}

/* Entry point for a newly-established main incoming channel: record it,
 * then either resume a paused postcopy or start a fresh incoming load. */
void migration_fd_process_incoming(QEMUFile *f, Error **errp)
{
    if (!migration_incoming_setup(f, errp)) {
        return;
    }
    if (postcopy_try_recover()) {
        return;
    }
    migration_incoming_process();
}

/*
 * Returns true when we want to start a new incoming migration process,
 * false otherwise.
 */
static bool migration_should_start_incoming(bool main_channel)
{
    /* Multifd doesn't start unless all channels are established */
    if (migrate_multifd()) {
        return migration_has_all_channels();
    }

    /* Preempt channel only starts when the main channel is created */
    if (migrate_postcopy_preempt()) {
        return main_channel;
    }

    /*
     * For all the rest types of migration, we should only reach here when
     * it's the main channel that's being created, and we should always
     * proceed with this channel.
     */
    assert(main_channel);
    return true;
}

/* Dispatch a newly-connected incoming IO channel: decide whether it is the
 * main channel (by peeking at the stream magic when possible) or an extra
 * multifd/postcopy-preempt channel, then start incoming processing once all
 * required channels are present. */
void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;
    QEMUFile *f;
    bool default_channel = true;
    uint32_t channel_magic = 0;
    int ret = 0;

    if (migrate_multifd() && !migrate_postcopy_ram() &&
        qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
        /*
         * With multiple channels, it is possible that we receive channels
         * out of order on destination side, causing incorrect mapping of
         * source channels on destination side. Check channel MAGIC to
         * decide type of channel. Please note this is best effort, postcopy
         * preempt channel does not send any magic number so avoid it for
         * postcopy live migration. Also tls live migration already does
         * tls handshake while initializing main channel so with tls this
         * issue is not possible.
         */
        ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
                                          sizeof(channel_magic), &local_err);

        if (ret != 0) {
            error_propagate(errp, local_err);
            return;
        }

        default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
    } else {
        default_channel = !mis->from_src_file;
    }

    /*
     * NOTE(review): if multifd_load_setup() already sets *errp on failure,
     * this extra error_setg() would trip the Error API's double-set
     * assertion — verify multifd_load_setup()'s errp contract.
     */
    if (multifd_load_setup(errp) != 0) {
        error_setg(errp, "Failed to setup multifd channels");
        return;
    }

    if (default_channel) {
        f = qemu_file_new_input(ioc);

        if (!migration_incoming_setup(f, errp)) {
            return;
        }
    } else {
        /* Multiple connections */
        assert(migration_needs_multiple_sockets());
        if (migrate_multifd()) {
            multifd_recv_new_channel(ioc, &local_err);
        } else {
            assert(migrate_postcopy_preempt());
            f = qemu_file_new_input(ioc);
            postcopy_preempt_new_channel(mis, f);
        }
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }

    if
 (migration_should_start_incoming(default_channel)) {
        /* If it's a recovery, we're done */
        if (postcopy_try_recover()) {
            return;
        }
        migration_incoming_process();
    }
}

/**
 * @migration_has_all_channels: We have received all channels that we need
 *
 * Returns true when we have got connections to all the channels that
 * we need for migration.
 */
bool migration_has_all_channels(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!mis->from_src_file) {
        return false;
    }

    if (migrate_multifd()) {
        return multifd_recv_all_channels_created();
    }

    if (migrate_postcopy_preempt()) {
        return mis->postcopy_qemufile_dst != NULL;
    }

    return true;
}

/* Tell the source it is OK to switch over (no payload). */
int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
{
    return migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK, 0, NULL);
}

/*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP. Non-0 value indicates
 * error.
 */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
}

/*
 * Send a 'PONG' message on the return channel with the given value
 * (normally in response to a 'PING')
 */
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}

/* Send the received-pages bitmap for 'block_name' back to the source
 * (postcopy recovery only). */
void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name)
{
    char buf[512];
    int len;
    int64_t res;

    /*
     * First, we send the header part. It contains only the len of
     * idstr, and the idstr itself.
     */
    len = strlen(block_name);
    buf[0] = len;
    memcpy(buf + 1, block_name, len);

    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
                     __func__);
        return;
    }

    migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);

    /*
     * Next, we dump the received bitmap to the stream.
     *
     * TODO: currently we are safe since we are the only one that is
     * using the to_src_file handle (fault thread is still paused),
     * and it's ok even not taking the mutex. However the best way is
     * to take the lock before sending the message header, and release
     * the lock after sending the bitmap.
     */
    qemu_mutex_lock(&mis->rp_mutex);
    res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
    qemu_mutex_unlock(&mis->rp_mutex);

    trace_migrate_send_rp_recv_bitmap(block_name, res);
}

/* Acknowledge a postcopy resume request with 'value'. */
void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
}

/*
 * Return true if we're already in the middle of a migration
 * (i.e.
 any of the active or setup states)
 */
bool migration_is_setup_or_active(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_COLO:
        return true;

    default:
        return false;

    }
}

/* Like migration_is_setup_or_active() but also counts CANCELLING as
 * running (and not COLO). */
bool migration_is_running(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_CANCELLING:
        return true;

    default:
        return false;

    }
}

/* Downtime is reported once complete, or while in postcopy. */
static bool migrate_show_downtime(MigrationState *s)
{
    return (s->state == MIGRATION_STATUS_COMPLETED) || migration_in_postcopy();
}

/* Fill the timing fields of MigrationInfo from MigrationState. */
static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
    info->has_status = true;
    info->has_setup_time = true;
    info->setup_time = s->setup_time;

    if (s->state == MIGRATION_STATUS_COMPLETED) {
        info->has_total_time = true;
        info->total_time = s->total_time;
    } else {
        /* still in flight: report elapsed time so far */
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                           s->start_time;
    }

    if (migrate_show_downtime(s)) {
        info->has_downtime = true;
        info->downtime = s->downtime;
    } else {
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
    }
}

/* Fill the RAM/xbzrle/throttle statistics of MigrationInfo. */
static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
1073 size_t page_size = qemu_target_page_size(); 1074 1075 info->ram = g_malloc0(sizeof(*info->ram)); 1076 info->ram->transferred = migration_transferred_bytes(); 1077 info->ram->total = ram_bytes_total(); 1078 info->ram->duplicate = stat64_get(&mig_stats.zero_pages); 1079 /* legacy value. It is not used anymore */ 1080 info->ram->skipped = 0; 1081 info->ram->normal = stat64_get(&mig_stats.normal_pages); 1082 info->ram->normal_bytes = info->ram->normal * page_size; 1083 info->ram->mbps = s->mbps; 1084 info->ram->dirty_sync_count = 1085 stat64_get(&mig_stats.dirty_sync_count); 1086 info->ram->dirty_sync_missed_zero_copy = 1087 stat64_get(&mig_stats.dirty_sync_missed_zero_copy); 1088 info->ram->postcopy_requests = 1089 stat64_get(&mig_stats.postcopy_requests); 1090 info->ram->page_size = page_size; 1091 info->ram->multifd_bytes = stat64_get(&mig_stats.multifd_bytes); 1092 info->ram->pages_per_second = s->pages_per_second; 1093 info->ram->precopy_bytes = stat64_get(&mig_stats.precopy_bytes); 1094 info->ram->downtime_bytes = stat64_get(&mig_stats.downtime_bytes); 1095 info->ram->postcopy_bytes = stat64_get(&mig_stats.postcopy_bytes); 1096 1097 if (migrate_xbzrle()) { 1098 info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); 1099 info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size(); 1100 info->xbzrle_cache->bytes = xbzrle_counters.bytes; 1101 info->xbzrle_cache->pages = xbzrle_counters.pages; 1102 info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss; 1103 info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate; 1104 info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate; 1105 info->xbzrle_cache->overflow = xbzrle_counters.overflow; 1106 } 1107 1108 populate_compress(info); 1109 1110 if (cpu_throttle_active()) { 1111 info->has_cpu_throttle_percentage = true; 1112 info->cpu_throttle_percentage = cpu_throttle_get_percentage(); 1113 } 1114 1115 if (s->state != MIGRATION_STATUS_COMPLETED) { 1116 info->ram->remaining = 
ram_bytes_remaining(); 1117 info->ram->dirty_pages_rate = 1118 stat64_get(&mig_stats.dirty_pages_rate); 1119 } 1120 1121 if (migrate_dirty_limit() && dirtylimit_in_service()) { 1122 info->has_dirty_limit_throttle_time_per_round = true; 1123 info->dirty_limit_throttle_time_per_round = 1124 dirtylimit_throttle_time_per_round(); 1125 1126 info->has_dirty_limit_ring_full_time = true; 1127 info->dirty_limit_ring_full_time = dirtylimit_ring_full_time(); 1128 } 1129 } 1130 1131 static void populate_disk_info(MigrationInfo *info) 1132 { 1133 if (blk_mig_active()) { 1134 info->disk = g_malloc0(sizeof(*info->disk)); 1135 info->disk->transferred = blk_mig_bytes_transferred(); 1136 info->disk->remaining = blk_mig_bytes_remaining(); 1137 info->disk->total = blk_mig_bytes_total(); 1138 } 1139 } 1140 1141 static void fill_source_migration_info(MigrationInfo *info) 1142 { 1143 MigrationState *s = migrate_get_current(); 1144 int state = qatomic_read(&s->state); 1145 GSList *cur_blocker = migration_blockers[migrate_mode()]; 1146 1147 info->blocked_reasons = NULL; 1148 1149 /* 1150 * There are two types of reasons a migration might be blocked; 1151 * a) devices marked in VMState as non-migratable, and 1152 * b) Explicit migration blockers 1153 * We need to add both of them here. 
1154 */ 1155 qemu_savevm_non_migratable_list(&info->blocked_reasons); 1156 1157 while (cur_blocker) { 1158 QAPI_LIST_PREPEND(info->blocked_reasons, 1159 g_strdup(error_get_pretty(cur_blocker->data))); 1160 cur_blocker = g_slist_next(cur_blocker); 1161 } 1162 info->has_blocked_reasons = info->blocked_reasons != NULL; 1163 1164 switch (state) { 1165 case MIGRATION_STATUS_NONE: 1166 /* no migration has happened ever */ 1167 /* do not overwrite destination migration status */ 1168 return; 1169 case MIGRATION_STATUS_SETUP: 1170 info->has_status = true; 1171 info->has_total_time = false; 1172 break; 1173 case MIGRATION_STATUS_ACTIVE: 1174 case MIGRATION_STATUS_CANCELLING: 1175 case MIGRATION_STATUS_POSTCOPY_ACTIVE: 1176 case MIGRATION_STATUS_PRE_SWITCHOVER: 1177 case MIGRATION_STATUS_DEVICE: 1178 case MIGRATION_STATUS_POSTCOPY_PAUSED: 1179 case MIGRATION_STATUS_POSTCOPY_RECOVER: 1180 /* TODO add some postcopy stats */ 1181 populate_time_info(info, s); 1182 populate_ram_info(info, s); 1183 populate_disk_info(info); 1184 migration_populate_vfio_info(info); 1185 break; 1186 case MIGRATION_STATUS_COLO: 1187 info->has_status = true; 1188 /* TODO: display COLO specific information (checkpoint info etc.) 
*/ 1189 break; 1190 case MIGRATION_STATUS_COMPLETED: 1191 populate_time_info(info, s); 1192 populate_ram_info(info, s); 1193 migration_populate_vfio_info(info); 1194 break; 1195 case MIGRATION_STATUS_FAILED: 1196 info->has_status = true; 1197 break; 1198 case MIGRATION_STATUS_CANCELLED: 1199 info->has_status = true; 1200 break; 1201 case MIGRATION_STATUS_WAIT_UNPLUG: 1202 info->has_status = true; 1203 break; 1204 } 1205 info->status = state; 1206 1207 QEMU_LOCK_GUARD(&s->error_mutex); 1208 if (s->error) { 1209 info->error_desc = g_strdup(error_get_pretty(s->error)); 1210 } 1211 } 1212 1213 static void fill_destination_migration_info(MigrationInfo *info) 1214 { 1215 MigrationIncomingState *mis = migration_incoming_get_current(); 1216 1217 if (mis->socket_address_list) { 1218 info->has_socket_address = true; 1219 info->socket_address = 1220 QAPI_CLONE(SocketAddressList, mis->socket_address_list); 1221 } 1222 1223 switch (mis->state) { 1224 case MIGRATION_STATUS_NONE: 1225 return; 1226 case MIGRATION_STATUS_SETUP: 1227 case MIGRATION_STATUS_CANCELLING: 1228 case MIGRATION_STATUS_CANCELLED: 1229 case MIGRATION_STATUS_ACTIVE: 1230 case MIGRATION_STATUS_POSTCOPY_ACTIVE: 1231 case MIGRATION_STATUS_POSTCOPY_PAUSED: 1232 case MIGRATION_STATUS_POSTCOPY_RECOVER: 1233 case MIGRATION_STATUS_FAILED: 1234 case MIGRATION_STATUS_COLO: 1235 info->has_status = true; 1236 break; 1237 case MIGRATION_STATUS_COMPLETED: 1238 info->has_status = true; 1239 fill_destination_postcopy_migration_info(info); 1240 break; 1241 } 1242 info->status = mis->state; 1243 } 1244 1245 MigrationInfo *qmp_query_migrate(Error **errp) 1246 { 1247 MigrationInfo *info = g_malloc0(sizeof(*info)); 1248 1249 fill_destination_migration_info(info); 1250 fill_source_migration_info(info); 1251 1252 return info; 1253 } 1254 1255 void qmp_migrate_start_postcopy(Error **errp) 1256 { 1257 MigrationState *s = migrate_get_current(); 1258 1259 if (!migrate_postcopy()) { 1260 error_setg(errp, "Enable postcopy with 
migrate_set_capability before" 1261 " the start of migration"); 1262 return; 1263 } 1264 1265 if (s->state == MIGRATION_STATUS_NONE) { 1266 error_setg(errp, "Postcopy must be started after migration has been" 1267 " started"); 1268 return; 1269 } 1270 /* 1271 * we don't error if migration has finished since that would be racy 1272 * with issuing this command. 1273 */ 1274 qatomic_set(&s->start_postcopy, true); 1275 } 1276 1277 /* shared migration helpers */ 1278 1279 void migrate_set_state(int *state, int old_state, int new_state) 1280 { 1281 assert(new_state < MIGRATION_STATUS__MAX); 1282 if (qatomic_cmpxchg(state, old_state, new_state) == old_state) { 1283 trace_migrate_set_state(MigrationStatus_str(new_state)); 1284 migrate_generate_event(new_state); 1285 } 1286 } 1287 1288 static void migrate_fd_cleanup(MigrationState *s) 1289 { 1290 qemu_bh_delete(s->cleanup_bh); 1291 s->cleanup_bh = NULL; 1292 1293 g_free(s->hostname); 1294 s->hostname = NULL; 1295 json_writer_free(s->vmdesc); 1296 s->vmdesc = NULL; 1297 1298 qemu_savevm_state_cleanup(); 1299 1300 if (s->to_dst_file) { 1301 QEMUFile *tmp; 1302 1303 trace_migrate_fd_cleanup(); 1304 qemu_mutex_unlock_iothread(); 1305 if (s->migration_thread_running) { 1306 qemu_thread_join(&s->thread); 1307 s->migration_thread_running = false; 1308 } 1309 qemu_mutex_lock_iothread(); 1310 1311 multifd_save_cleanup(); 1312 qemu_mutex_lock(&s->qemu_file_lock); 1313 tmp = s->to_dst_file; 1314 s->to_dst_file = NULL; 1315 qemu_mutex_unlock(&s->qemu_file_lock); 1316 /* 1317 * Close the file handle without the lock to make sure the 1318 * critical section won't block for long. 1319 */ 1320 migration_ioc_unregister_yank_from_file(tmp); 1321 qemu_fclose(tmp); 1322 } 1323 1324 /* 1325 * We already cleaned up to_dst_file, so errors from the return 1326 * path might be due to that, ignore them. 
1327 */ 1328 close_return_path_on_source(s); 1329 1330 assert(!migration_is_active(s)); 1331 1332 if (s->state == MIGRATION_STATUS_CANCELLING) { 1333 migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING, 1334 MIGRATION_STATUS_CANCELLED); 1335 } 1336 1337 if (s->error) { 1338 /* It is used on info migrate. We can't free it */ 1339 error_report_err(error_copy(s->error)); 1340 } 1341 migration_call_notifiers(s); 1342 block_cleanup_parameters(); 1343 yank_unregister_instance(MIGRATION_YANK_INSTANCE); 1344 } 1345 1346 static void migrate_fd_cleanup_schedule(MigrationState *s) 1347 { 1348 /* 1349 * Ref the state for bh, because it may be called when 1350 * there're already no other refs 1351 */ 1352 object_ref(OBJECT(s)); 1353 qemu_bh_schedule(s->cleanup_bh); 1354 } 1355 1356 static void migrate_fd_cleanup_bh(void *opaque) 1357 { 1358 MigrationState *s = opaque; 1359 migrate_fd_cleanup(s); 1360 object_unref(OBJECT(s)); 1361 } 1362 1363 void migrate_set_error(MigrationState *s, const Error *error) 1364 { 1365 QEMU_LOCK_GUARD(&s->error_mutex); 1366 if (!s->error) { 1367 s->error = error_copy(error); 1368 } 1369 } 1370 1371 bool migrate_has_error(MigrationState *s) 1372 { 1373 /* The lock is not helpful here, but still follow the rule */ 1374 QEMU_LOCK_GUARD(&s->error_mutex); 1375 return qatomic_read(&s->error); 1376 } 1377 1378 static void migrate_error_free(MigrationState *s) 1379 { 1380 QEMU_LOCK_GUARD(&s->error_mutex); 1381 if (s->error) { 1382 error_free(s->error); 1383 s->error = NULL; 1384 } 1385 } 1386 1387 static void migrate_fd_error(MigrationState *s, const Error *error) 1388 { 1389 trace_migrate_fd_error(error_get_pretty(error)); 1390 assert(s->to_dst_file == NULL); 1391 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, 1392 MIGRATION_STATUS_FAILED); 1393 migrate_set_error(s, error); 1394 } 1395 1396 static void migrate_fd_cancel(MigrationState *s) 1397 { 1398 int old_state ; 1399 1400 trace_migrate_fd_cancel(); 1401 1402 
WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { 1403 if (s->rp_state.from_dst_file) { 1404 /* shutdown the rp socket, so causing the rp thread to shutdown */ 1405 qemu_file_shutdown(s->rp_state.from_dst_file); 1406 } 1407 } 1408 1409 do { 1410 old_state = s->state; 1411 if (!migration_is_running(old_state)) { 1412 break; 1413 } 1414 /* If the migration is paused, kick it out of the pause */ 1415 if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) { 1416 qemu_sem_post(&s->pause_sem); 1417 } 1418 migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING); 1419 } while (s->state != MIGRATION_STATUS_CANCELLING); 1420 1421 /* 1422 * If we're unlucky the migration code might be stuck somewhere in a 1423 * send/write while the network has failed and is waiting to timeout; 1424 * if we've got shutdown(2) available then we can force it to quit. 1425 */ 1426 if (s->state == MIGRATION_STATUS_CANCELLING) { 1427 WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) { 1428 if (s->to_dst_file) { 1429 qemu_file_shutdown(s->to_dst_file); 1430 } 1431 } 1432 } 1433 if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) { 1434 Error *local_err = NULL; 1435 1436 bdrv_activate_all(&local_err); 1437 if (local_err) { 1438 error_report_err(local_err); 1439 } else { 1440 s->block_inactive = false; 1441 } 1442 } 1443 } 1444 1445 void migration_add_notifier(Notifier *notify, 1446 void (*func)(Notifier *notifier, void *data)) 1447 { 1448 notify->notify = func; 1449 notifier_list_add(&migration_state_notifiers, notify); 1450 } 1451 1452 void migration_remove_notifier(Notifier *notify) 1453 { 1454 if (notify->notify) { 1455 notifier_remove(notify); 1456 notify->notify = NULL; 1457 } 1458 } 1459 1460 void migration_call_notifiers(MigrationState *s) 1461 { 1462 notifier_list_notify(&migration_state_notifiers, s); 1463 } 1464 1465 bool migration_in_setup(MigrationState *s) 1466 { 1467 return s->state == MIGRATION_STATUS_SETUP; 1468 } 1469 1470 bool migration_has_finished(MigrationState *s) 1471 { 
1472 return s->state == MIGRATION_STATUS_COMPLETED; 1473 } 1474 1475 bool migration_has_failed(MigrationState *s) 1476 { 1477 return (s->state == MIGRATION_STATUS_CANCELLED || 1478 s->state == MIGRATION_STATUS_FAILED); 1479 } 1480 1481 bool migration_in_postcopy(void) 1482 { 1483 MigrationState *s = migrate_get_current(); 1484 1485 switch (s->state) { 1486 case MIGRATION_STATUS_POSTCOPY_ACTIVE: 1487 case MIGRATION_STATUS_POSTCOPY_PAUSED: 1488 case MIGRATION_STATUS_POSTCOPY_RECOVER: 1489 return true; 1490 default: 1491 return false; 1492 } 1493 } 1494 1495 bool migration_postcopy_is_alive(int state) 1496 { 1497 switch (state) { 1498 case MIGRATION_STATUS_POSTCOPY_ACTIVE: 1499 case MIGRATION_STATUS_POSTCOPY_RECOVER: 1500 return true; 1501 default: 1502 return false; 1503 } 1504 } 1505 1506 bool migration_in_postcopy_after_devices(MigrationState *s) 1507 { 1508 return migration_in_postcopy() && s->postcopy_after_devices; 1509 } 1510 1511 bool migration_in_incoming_postcopy(void) 1512 { 1513 PostcopyState ps = postcopy_state_get(); 1514 1515 return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END; 1516 } 1517 1518 bool migration_incoming_postcopy_advised(void) 1519 { 1520 PostcopyState ps = postcopy_state_get(); 1521 1522 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END; 1523 } 1524 1525 bool migration_in_bg_snapshot(void) 1526 { 1527 MigrationState *s = migrate_get_current(); 1528 1529 return migrate_background_snapshot() && 1530 migration_is_setup_or_active(s->state); 1531 } 1532 1533 bool migration_is_idle(void) 1534 { 1535 MigrationState *s = current_migration; 1536 1537 if (!s) { 1538 return true; 1539 } 1540 1541 switch (s->state) { 1542 case MIGRATION_STATUS_NONE: 1543 case MIGRATION_STATUS_CANCELLED: 1544 case MIGRATION_STATUS_COMPLETED: 1545 case MIGRATION_STATUS_FAILED: 1546 return true; 1547 case MIGRATION_STATUS_SETUP: 1548 case MIGRATION_STATUS_CANCELLING: 1549 case MIGRATION_STATUS_ACTIVE: 1550 case 
MIGRATION_STATUS_POSTCOPY_ACTIVE: 1551 case MIGRATION_STATUS_COLO: 1552 case MIGRATION_STATUS_PRE_SWITCHOVER: 1553 case MIGRATION_STATUS_DEVICE: 1554 case MIGRATION_STATUS_WAIT_UNPLUG: 1555 return false; 1556 case MIGRATION_STATUS__MAX: 1557 g_assert_not_reached(); 1558 } 1559 1560 return false; 1561 } 1562 1563 bool migration_is_active(MigrationState *s) 1564 { 1565 return (s->state == MIGRATION_STATUS_ACTIVE || 1566 s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); 1567 } 1568 1569 int migrate_init(MigrationState *s, Error **errp) 1570 { 1571 int ret; 1572 1573 ret = qemu_savevm_state_prepare(errp); 1574 if (ret) { 1575 return ret; 1576 } 1577 1578 /* 1579 * Reinitialise all migration state, except 1580 * parameters/capabilities that the user set, and 1581 * locks. 1582 */ 1583 s->cleanup_bh = 0; 1584 s->vm_start_bh = 0; 1585 s->to_dst_file = NULL; 1586 s->state = MIGRATION_STATUS_NONE; 1587 s->rp_state.from_dst_file = NULL; 1588 s->mbps = 0.0; 1589 s->pages_per_second = 0.0; 1590 s->downtime = 0; 1591 s->expected_downtime = 0; 1592 s->setup_time = 0; 1593 s->start_postcopy = false; 1594 s->postcopy_after_devices = false; 1595 s->migration_thread_running = false; 1596 error_free(s->error); 1597 s->error = NULL; 1598 s->hostname = NULL; 1599 s->vmdesc = NULL; 1600 1601 migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); 1602 1603 s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1604 s->total_time = 0; 1605 s->vm_old_state = -1; 1606 s->iteration_initial_bytes = 0; 1607 s->threshold_size = 0; 1608 s->switchover_acked = false; 1609 s->rdma_migration = false; 1610 /* 1611 * set mig_stats memory to zero for a new migration 1612 */ 1613 memset(&mig_stats, 0, sizeof(mig_stats)); 1614 migration_reset_vfio_bytes_transferred(); 1615 1616 return 0; 1617 } 1618 1619 static bool is_busy(Error **reasonp, Error **errp) 1620 { 1621 ERRP_GUARD(); 1622 1623 /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. 
*/ 1624 if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) { 1625 error_propagate_prepend(errp, *reasonp, 1626 "disallowing migration blocker " 1627 "(migration/snapshot in progress) for: "); 1628 *reasonp = NULL; 1629 return true; 1630 } 1631 return false; 1632 } 1633 1634 static bool is_only_migratable(Error **reasonp, Error **errp, int modes) 1635 { 1636 ERRP_GUARD(); 1637 1638 if (only_migratable && (modes & BIT(MIG_MODE_NORMAL))) { 1639 error_propagate_prepend(errp, *reasonp, 1640 "disallowing migration blocker " 1641 "(--only-migratable) for: "); 1642 *reasonp = NULL; 1643 return true; 1644 } 1645 return false; 1646 } 1647 1648 static int get_modes(MigMode mode, va_list ap) 1649 { 1650 int modes = 0; 1651 1652 while (mode != -1 && mode != MIG_MODE_ALL) { 1653 assert(mode >= MIG_MODE_NORMAL && mode < MIG_MODE__MAX); 1654 modes |= BIT(mode); 1655 mode = va_arg(ap, MigMode); 1656 } 1657 if (mode == MIG_MODE_ALL) { 1658 modes = BIT(MIG_MODE__MAX) - 1; 1659 } 1660 return modes; 1661 } 1662 1663 static int add_blockers(Error **reasonp, Error **errp, int modes) 1664 { 1665 for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { 1666 if (modes & BIT(mode)) { 1667 migration_blockers[mode] = g_slist_prepend(migration_blockers[mode], 1668 *reasonp); 1669 } 1670 } 1671 return 0; 1672 } 1673 1674 int migrate_add_blocker(Error **reasonp, Error **errp) 1675 { 1676 return migrate_add_blocker_modes(reasonp, errp, MIG_MODE_ALL); 1677 } 1678 1679 int migrate_add_blocker_normal(Error **reasonp, Error **errp) 1680 { 1681 return migrate_add_blocker_modes(reasonp, errp, MIG_MODE_NORMAL, -1); 1682 } 1683 1684 int migrate_add_blocker_modes(Error **reasonp, Error **errp, MigMode mode, ...) 
1685 { 1686 int modes; 1687 va_list ap; 1688 1689 va_start(ap, mode); 1690 modes = get_modes(mode, ap); 1691 va_end(ap); 1692 1693 if (is_only_migratable(reasonp, errp, modes)) { 1694 return -EACCES; 1695 } else if (is_busy(reasonp, errp)) { 1696 return -EBUSY; 1697 } 1698 return add_blockers(reasonp, errp, modes); 1699 } 1700 1701 int migrate_add_blocker_internal(Error **reasonp, Error **errp) 1702 { 1703 int modes = BIT(MIG_MODE__MAX) - 1; 1704 1705 if (is_busy(reasonp, errp)) { 1706 return -EBUSY; 1707 } 1708 return add_blockers(reasonp, errp, modes); 1709 } 1710 1711 void migrate_del_blocker(Error **reasonp) 1712 { 1713 if (*reasonp) { 1714 for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { 1715 migration_blockers[mode] = g_slist_remove(migration_blockers[mode], 1716 *reasonp); 1717 } 1718 error_free(*reasonp); 1719 *reasonp = NULL; 1720 } 1721 } 1722 1723 void qmp_migrate_incoming(const char *uri, bool has_channels, 1724 MigrationChannelList *channels, Error **errp) 1725 { 1726 Error *local_err = NULL; 1727 static bool once = true; 1728 1729 if (!once) { 1730 error_setg(errp, "The incoming migration has already been started"); 1731 return; 1732 } 1733 if (!runstate_check(RUN_STATE_INMIGRATE)) { 1734 error_setg(errp, "'-incoming' was not specified on the command line"); 1735 return; 1736 } 1737 1738 if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) { 1739 return; 1740 } 1741 1742 qemu_start_incoming_migration(uri, has_channels, channels, &local_err); 1743 1744 if (local_err) { 1745 yank_unregister_instance(MIGRATION_YANK_INSTANCE); 1746 error_propagate(errp, local_err); 1747 return; 1748 } 1749 1750 once = false; 1751 } 1752 1753 void qmp_migrate_recover(const char *uri, Error **errp) 1754 { 1755 MigrationIncomingState *mis = migration_incoming_get_current(); 1756 1757 /* 1758 * Don't even bother to use ERRP_GUARD() as it _must_ always be set by 1759 * callers (no one should ignore a recover failure); if there is, it's a 1760 * programming error. 
1761 */ 1762 assert(errp); 1763 1764 if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) { 1765 error_setg(errp, "Migrate recover can only be run " 1766 "when postcopy is paused."); 1767 return; 1768 } 1769 1770 /* If there's an existing transport, release it */ 1771 migration_incoming_transport_cleanup(mis); 1772 1773 /* 1774 * Note that this call will never start a real migration; it will 1775 * only re-setup the migration stream and poke existing migration 1776 * to continue using that newly established channel. 1777 */ 1778 qemu_start_incoming_migration(uri, false, NULL, errp); 1779 } 1780 1781 void qmp_migrate_pause(Error **errp) 1782 { 1783 MigrationState *ms = migrate_get_current(); 1784 MigrationIncomingState *mis = migration_incoming_get_current(); 1785 int ret = 0; 1786 1787 if (migration_postcopy_is_alive(ms->state)) { 1788 /* Source side, during postcopy */ 1789 Error *error = NULL; 1790 1791 /* Tell the core migration that we're pausing */ 1792 error_setg(&error, "Postcopy migration is paused by the user"); 1793 migrate_set_error(ms, error); 1794 error_free(error); 1795 1796 qemu_mutex_lock(&ms->qemu_file_lock); 1797 if (ms->to_dst_file) { 1798 ret = qemu_file_shutdown(ms->to_dst_file); 1799 } 1800 qemu_mutex_unlock(&ms->qemu_file_lock); 1801 if (ret) { 1802 error_setg(errp, "Failed to pause source migration"); 1803 } 1804 1805 /* 1806 * Kick the migration thread out of any waiting windows (on behalf 1807 * of the rp thread). 
1808 */ 1809 migration_rp_kick(ms); 1810 1811 return; 1812 } 1813 1814 if (migration_postcopy_is_alive(mis->state)) { 1815 ret = qemu_file_shutdown(mis->from_src_file); 1816 if (ret) { 1817 error_setg(errp, "Failed to pause destination migration"); 1818 } 1819 return; 1820 } 1821 1822 error_setg(errp, "migrate-pause is currently only supported " 1823 "during postcopy-active or postcopy-recover state"); 1824 } 1825 1826 bool migration_is_blocked(Error **errp) 1827 { 1828 GSList *blockers = migration_blockers[migrate_mode()]; 1829 1830 if (qemu_savevm_state_blocked(errp)) { 1831 return true; 1832 } 1833 1834 if (blockers) { 1835 error_propagate(errp, error_copy(blockers->data)); 1836 return true; 1837 } 1838 1839 return false; 1840 } 1841 1842 /* Returns true if continue to migrate, or false if error detected */ 1843 static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, 1844 bool resume, Error **errp) 1845 { 1846 Error *local_err = NULL; 1847 1848 if (blk_inc) { 1849 warn_report("parameter 'inc' is deprecated;" 1850 " use blockdev-mirror with NBD instead"); 1851 } 1852 1853 if (blk) { 1854 warn_report("parameter 'blk' is deprecated;" 1855 " use blockdev-mirror with NBD instead"); 1856 } 1857 1858 if (resume) { 1859 if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) { 1860 error_setg(errp, "Cannot resume if there is no " 1861 "paused migration"); 1862 return false; 1863 } 1864 1865 /* 1866 * Postcopy recovery won't work well with release-ram 1867 * capability since release-ram will drop the page buffer as 1868 * long as the page is put into the send buffer. So if there 1869 * is a network failure happened, any page buffers that have 1870 * not yet reached the destination VM but have already been 1871 * sent from the source VM will be lost forever. Let's refuse 1872 * the client from resuming such a postcopy migration. 
1873 * Luckily release-ram was designed to only be used when src 1874 * and destination VMs are on the same host, so it should be 1875 * fine. 1876 */ 1877 if (migrate_release_ram()) { 1878 error_setg(errp, "Postcopy recovery cannot work " 1879 "when release-ram capability is set"); 1880 return false; 1881 } 1882 1883 /* This is a resume, skip init status */ 1884 return true; 1885 } 1886 1887 if (migration_is_running(s->state)) { 1888 error_setg(errp, QERR_MIGRATION_ACTIVE); 1889 return false; 1890 } 1891 1892 if (runstate_check(RUN_STATE_INMIGRATE)) { 1893 error_setg(errp, "Guest is waiting for an incoming migration"); 1894 return false; 1895 } 1896 1897 if (runstate_check(RUN_STATE_POSTMIGRATE)) { 1898 error_setg(errp, "Can't migrate the vm that was paused due to " 1899 "previous migration"); 1900 return false; 1901 } 1902 1903 if (migration_is_blocked(errp)) { 1904 return false; 1905 } 1906 1907 if (blk || blk_inc) { 1908 if (migrate_colo()) { 1909 error_setg(errp, "No disk migration is required in COLO mode"); 1910 return false; 1911 } 1912 if (migrate_block() || migrate_block_incremental()) { 1913 error_setg(errp, "Command options are incompatible with " 1914 "current migration capabilities"); 1915 return false; 1916 } 1917 if (!migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, true, &local_err)) { 1918 error_propagate(errp, local_err); 1919 return false; 1920 } 1921 s->must_remove_block_options = true; 1922 } 1923 1924 if (blk_inc) { 1925 migrate_set_block_incremental(true); 1926 } 1927 1928 if (migrate_init(s, errp)) { 1929 return false; 1930 } 1931 1932 return true; 1933 } 1934 1935 void qmp_migrate(const char *uri, bool has_channels, 1936 MigrationChannelList *channels, bool has_blk, bool blk, 1937 bool has_inc, bool inc, bool has_detach, bool detach, 1938 bool has_resume, bool resume, Error **errp) 1939 { 1940 bool resume_requested; 1941 Error *local_err = NULL; 1942 MigrationState *s = migrate_get_current(); 1943 g_autoptr(MigrationChannel) channel = NULL; 
1944 MigrationAddress *addr = NULL; 1945 1946 /* 1947 * Having preliminary checks for uri and channel 1948 */ 1949 if (uri && has_channels) { 1950 error_setg(errp, "'uri' and 'channels' arguments are mutually " 1951 "exclusive; exactly one of the two should be present in " 1952 "'migrate' qmp command "); 1953 return; 1954 } else if (channels) { 1955 /* To verify that Migrate channel list has only item */ 1956 if (channels->next) { 1957 error_setg(errp, "Channel list has more than one entries"); 1958 return; 1959 } 1960 addr = channels->value->addr; 1961 } else if (uri) { 1962 /* caller uses the old URI syntax */ 1963 if (!migrate_uri_parse(uri, &channel, errp)) { 1964 return; 1965 } 1966 addr = channel->addr; 1967 } else { 1968 error_setg(errp, "neither 'uri' or 'channels' argument are " 1969 "specified in 'migrate' qmp command "); 1970 return; 1971 } 1972 1973 /* transport mechanism not suitable for migration? */ 1974 if (!migration_channels_and_transport_compatible(addr, errp)) { 1975 return; 1976 } 1977 1978 resume_requested = has_resume && resume; 1979 if (!migrate_prepare(s, has_blk && blk, has_inc && inc, 1980 resume_requested, errp)) { 1981 /* Error detected, put into errp */ 1982 return; 1983 } 1984 1985 if (!resume_requested) { 1986 if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) { 1987 return; 1988 } 1989 } 1990 1991 if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { 1992 SocketAddress *saddr = &addr->u.socket; 1993 if (saddr->type == SOCKET_ADDRESS_TYPE_INET || 1994 saddr->type == SOCKET_ADDRESS_TYPE_UNIX || 1995 saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) { 1996 socket_start_outgoing_migration(s, saddr, &local_err); 1997 } else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) { 1998 fd_start_outgoing_migration(s, saddr->u.fd.str, &local_err); 1999 } 2000 #ifdef CONFIG_RDMA 2001 } else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) { 2002 rdma_start_outgoing_migration(s, &addr->u.rdma, &local_err); 2003 #endif 2004 } else if 
(addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) { 2005 exec_start_outgoing_migration(s, addr->u.exec.args, &local_err); 2006 } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { 2007 file_start_outgoing_migration(s, &addr->u.file, &local_err); 2008 } else { 2009 error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri", 2010 "a valid migration protocol"); 2011 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, 2012 MIGRATION_STATUS_FAILED); 2013 block_cleanup_parameters(); 2014 } 2015 2016 if (local_err) { 2017 if (!resume_requested) { 2018 yank_unregister_instance(MIGRATION_YANK_INSTANCE); 2019 } 2020 migrate_fd_error(s, local_err); 2021 error_propagate(errp, local_err); 2022 return; 2023 } 2024 } 2025 2026 void qmp_migrate_cancel(Error **errp) 2027 { 2028 migration_cancel(NULL); 2029 } 2030 2031 void qmp_migrate_continue(MigrationStatus state, Error **errp) 2032 { 2033 MigrationState *s = migrate_get_current(); 2034 if (s->state != state) { 2035 error_setg(errp, "Migration not in expected state: %s", 2036 MigrationStatus_str(s->state)); 2037 return; 2038 } 2039 qemu_sem_post(&s->pause_sem); 2040 } 2041 2042 int migration_rp_wait(MigrationState *s) 2043 { 2044 /* If migration has failure already, ignore the wait */ 2045 if (migrate_has_error(s)) { 2046 return -1; 2047 } 2048 2049 qemu_sem_wait(&s->rp_state.rp_sem); 2050 2051 /* After wait, double check that there's no failure */ 2052 if (migrate_has_error(s)) { 2053 return -1; 2054 } 2055 2056 return 0; 2057 } 2058 2059 void migration_rp_kick(MigrationState *s) 2060 { 2061 qemu_sem_post(&s->rp_state.rp_sem); 2062 } 2063 2064 static struct rp_cmd_args { 2065 ssize_t len; /* -1 = variable */ 2066 const char *name; 2067 } rp_cmd_args[] = { 2068 [MIG_RP_MSG_INVALID] = { .len = -1, .name = "INVALID" }, 2069 [MIG_RP_MSG_SHUT] = { .len = 4, .name = "SHUT" }, 2070 [MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" }, 2071 [MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" }, 2072 [MIG_RP_MSG_REQ_PAGES_ID] = 
{ .len = -1, .name = "REQ_PAGES_ID" }, 2073 [MIG_RP_MSG_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" }, 2074 [MIG_RP_MSG_RESUME_ACK] = { .len = 4, .name = "RESUME_ACK" }, 2075 [MIG_RP_MSG_SWITCHOVER_ACK] = { .len = 0, .name = "SWITCHOVER_ACK" }, 2076 [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" }, 2077 }; 2078 2079 /* 2080 * Process a request for pages received on the return path, 2081 * We're allowed to send more than requested (e.g. to round to our page size) 2082 * and we don't need to send pages that have already been sent. 2083 */ 2084 static void 2085 migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, 2086 ram_addr_t start, size_t len, Error **errp) 2087 { 2088 long our_host_ps = qemu_real_host_page_size(); 2089 2090 trace_migrate_handle_rp_req_pages(rbname, start, len); 2091 2092 /* 2093 * Since we currently insist on matching page sizes, just sanity check 2094 * we're being asked for whole host pages. 2095 */ 2096 if (!QEMU_IS_ALIGNED(start, our_host_ps) || 2097 !QEMU_IS_ALIGNED(len, our_host_ps)) { 2098 error_setg(errp, "MIG_RP_MSG_REQ_PAGES: Misaligned page request, start:" 2099 RAM_ADDR_FMT " len: %zd", start, len); 2100 return; 2101 } 2102 2103 ram_save_queue_pages(rbname, start, len, errp); 2104 } 2105 2106 static bool migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name, 2107 Error **errp) 2108 { 2109 RAMBlock *block = qemu_ram_block_by_name(block_name); 2110 2111 if (!block) { 2112 error_setg(errp, "MIG_RP_MSG_RECV_BITMAP has invalid block name '%s'", 2113 block_name); 2114 return false; 2115 } 2116 2117 /* Fetch the received bitmap and refresh the dirty bitmap */ 2118 return ram_dirty_bitmap_reload(s, block, errp); 2119 } 2120 2121 static bool migrate_handle_rp_resume_ack(MigrationState *s, 2122 uint32_t value, Error **errp) 2123 { 2124 trace_source_return_path_thread_resume_ack(value); 2125 2126 if (value != MIGRATION_RESUME_ACK_VALUE) { 2127 error_setg(errp, "illegal resume_ack value %"PRIu32, value); 2128 
return false; 2129 } 2130 2131 /* Now both sides are active. */ 2132 migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER, 2133 MIGRATION_STATUS_POSTCOPY_ACTIVE); 2134 2135 /* Notify send thread that time to continue send pages */ 2136 migration_rp_kick(s); 2137 2138 return true; 2139 } 2140 2141 /* 2142 * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if 2143 * existed) in a safe way. 2144 */ 2145 static void migration_release_dst_files(MigrationState *ms) 2146 { 2147 QEMUFile *file; 2148 2149 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) { 2150 /* 2151 * Reset the from_dst_file pointer first before releasing it, as we 2152 * can't block within lock section 2153 */ 2154 file = ms->rp_state.from_dst_file; 2155 ms->rp_state.from_dst_file = NULL; 2156 } 2157 2158 /* 2159 * Do the same to postcopy fast path socket too if there is. No 2160 * locking needed because this qemufile should only be managed by 2161 * return path thread. 2162 */ 2163 if (ms->postcopy_qemufile_src) { 2164 migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src); 2165 qemu_file_shutdown(ms->postcopy_qemufile_src); 2166 qemu_fclose(ms->postcopy_qemufile_src); 2167 ms->postcopy_qemufile_src = NULL; 2168 } 2169 2170 qemu_fclose(file); 2171 } 2172 2173 /* 2174 * Handles messages sent on the return path towards the source VM 2175 * 2176 */ 2177 static void *source_return_path_thread(void *opaque) 2178 { 2179 MigrationState *ms = opaque; 2180 QEMUFile *rp = ms->rp_state.from_dst_file; 2181 uint16_t header_len, header_type; 2182 uint8_t buf[512]; 2183 uint32_t tmp32, sibling_error; 2184 ram_addr_t start = 0; /* =0 to silence warning */ 2185 size_t len = 0, expected_len; 2186 Error *err = NULL; 2187 int res; 2188 2189 trace_source_return_path_thread_entry(); 2190 rcu_register_thread(); 2191 2192 while (migration_is_setup_or_active(ms->state)) { 2193 trace_source_return_path_thread_loop_top(); 2194 2195 header_type = qemu_get_be16(rp); 2196 header_len = qemu_get_be16(rp); 

        if (qemu_file_get_error(rp)) {
            qemu_file_get_error_obj(rp, &err);
            goto out;
        }

        if (header_type >= MIG_RP_MSG_MAX ||
            header_type == MIG_RP_MSG_INVALID) {
            error_setg(&err, "Received invalid message 0x%04x length 0x%04x",
                       header_type, header_len);
            goto out;
        }

        /*
         * Validate length against the per-message expectation in
         * rp_cmd_args; .len == -1 means variable length, but the payload
         * must still fit in buf[].
         */
        if ((rp_cmd_args[header_type].len != -1 &&
             header_len != rp_cmd_args[header_type].len) ||
            header_len > sizeof(buf)) {
            error_setg(&err, "Received '%s' message (0x%04x) with"
                       "incorrect length %d expecting %zu",
                       rp_cmd_args[header_type].name, header_type, header_len,
                       (size_t)rp_cmd_args[header_type].len);
            goto out;
        }

        /* We know we've got a valid header by this point */
        res = qemu_get_buffer(rp, buf, header_len);
        if (res != header_len) {
            error_setg(&err, "Failed reading data for message 0x%04x"
                       " read %d expected %d",
                       header_type, res, header_len);
            goto out;
        }

        /* OK, we have the message and the data */
        switch (header_type) {
        case MIG_RP_MSG_SHUT:
            sibling_error = ldl_be_p(buf);
            trace_source_return_path_thread_shut(sibling_error);
            if (sibling_error) {
                error_setg(&err, "Sibling indicated error %d", sibling_error);
            }
            /*
             * We'll let the main thread deal with closing the RP
             * we could do a shutdown(2) on it, but we're the only user
             * anyway, so there's nothing gained.
             */
            goto out;

        case MIG_RP_MSG_PONG:
            tmp32 = ldl_be_p(buf);
            trace_source_return_path_thread_pong(tmp32);
            qemu_sem_post(&ms->rp_state.rp_pong_acks);
            break;

        case MIG_RP_MSG_REQ_PAGES:
            start = ldq_be_p(buf);
            len = ldl_be_p(buf + 8);
            migrate_handle_rp_req_pages(ms, NULL, start, len, &err);
            if (err) {
                goto out;
            }
            break;

        case MIG_RP_MSG_REQ_PAGES_ID:
            expected_len = 12 + 1; /* header + termination */

            if (header_len >= expected_len) {
                start = ldq_be_p(buf);
                len = ldl_be_p(buf + 8);
                /* Now we expect an idstr */
                tmp32 = buf[12]; /* Length of the following idstr */
                buf[13 + tmp32] = '\0';
                expected_len += tmp32;
            }
            if (header_len != expected_len) {
                error_setg(&err, "Req_Page_id with length %d expecting %zd",
                           header_len, expected_len);
                goto out;
            }
            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len,
                                        &err);
            if (err) {
                goto out;
            }
            break;

        case MIG_RP_MSG_RECV_BITMAP:
            if (header_len < 1) {
                error_setg(&err, "MIG_RP_MSG_RECV_BITMAP missing block name");
                goto out;
            }
            /* Format: len (1B) + idstr (<255B). This ends the idstr.
 */
            buf[buf[0] + 1] = '\0';
            if (!migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1), &err)) {
                goto out;
            }
            break;

        case MIG_RP_MSG_RESUME_ACK:
            tmp32 = ldl_be_p(buf);
            if (!migrate_handle_rp_resume_ack(ms, tmp32, &err)) {
                goto out;
            }
            break;

        case MIG_RP_MSG_SWITCHOVER_ACK:
            ms->switchover_acked = true;
            trace_source_return_path_thread_switchover_acked();
            break;

        default:
            break;
        }
    }

out:
    if (err) {
        migrate_set_error(ms, err);
        error_free(err);
        trace_source_return_path_thread_bad_end();
    }

    if (ms->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
        /*
         * this will be extremely unlikely: that we got yet another network
         * issue during recovering of the 1st network failure.. during this
         * period the main migration thread can be waiting on rp_sem for
         * this thread to sync with the other side.
         *
         * When this happens, explicitly kick the migration thread out of
         * RECOVER stage and back to PAUSED, so the admin can try
         * everything again.
         */
        migration_rp_kick(ms);
    }

    trace_source_return_path_thread_end();
    rcu_unregister_thread();

    return NULL;
}

/*
 * Open the return path from to_dst_file and spawn the thread that reads
 * it.  Returns 0 on success, -1 if the return path channel could not be
 * created.
 */
static int open_return_path_on_source(MigrationState *ms)
{
    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
    if (!ms->rp_state.from_dst_file) {
        return -1;
    }

    trace_open_return_path_on_source();

    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
    ms->rp_state.rp_thread_created = true;

    trace_open_return_path_on_source_continue();

    return 0;
}

/* Return true if error detected, or false otherwise */
static bool close_return_path_on_source(MigrationState *ms)
{
    if (!ms->rp_state.rp_thread_created) {
        return false;
    }

    trace_migration_return_path_end_before();

    /*
     * If this is a normal exit then the destination will send a SHUT
     * and the rp_thread will exit, however if there's an error we
     * need to cause it to exit. shutdown(2), if we have it, will
     * cause it to unblock if it's stuck waiting for the destination.
     */
    WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
        if (ms->to_dst_file && ms->rp_state.from_dst_file &&
            qemu_file_get_error(ms->to_dst_file)) {
            qemu_file_shutdown(ms->rp_state.from_dst_file);
        }
    }

    qemu_thread_join(&ms->rp_state.rp_thread);
    ms->rp_state.rp_thread_created = false;
    migration_release_dst_files(ms);
    trace_migration_return_path_end_after();

    /* Return path will persist the error in MigrationState when quit */
    return migrate_has_error(ms);
}

static inline void
migration_wait_main_channel(MigrationState *ms)
{
    /* Wait until one PONG message received */
    qemu_sem_wait(&ms->rp_state.rp_pong_acks);
}

/*
 * Switch from normal iteration to postcopy
 * Returns non-0 on error
 */
static int postcopy_start(MigrationState *ms, Error **errp)
{
    int ret;
    QIOChannelBuffer *bioc;
    QEMUFile *fb;
    uint64_t bandwidth = migrate_max_postcopy_bandwidth();
    bool restart_block = false;
    int cur_state = MIGRATION_STATUS_ACTIVE;

    if (migrate_postcopy_preempt()) {
        /* The preempt channel is established after the main channel */
        migration_wait_main_channel(ms);
        if (postcopy_preempt_establish_channel(ms)) {
            migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
            return -1;
        }
    }

    if (!migrate_pause_before_switchover()) {
        migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_POSTCOPY_ACTIVE);
    }

    trace_postcopy_start();
    qemu_mutex_lock_iothread();
    trace_postcopy_start_set_run();

    migration_downtime_start(ms);

    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
    global_state_store();
    ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
    if (ret < 0) {
        goto fail;
    }

    ret = migration_maybe_pause(ms, &cur_state,
                                MIGRATION_STATUS_POSTCOPY_ACTIVE);
    if (ret < 0) {
        goto fail;
    }

    ret =
bdrv_inactivate_all();
    if (ret < 0) {
        goto fail;
    }
    /* From here on a failure must reactivate block devices, see fail: */
    restart_block = true;

    /*
     * Cause any non-postcopiable, but iterative devices to
     * send out their final data.
     */
    qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);

    /*
     * in Finish migrate and with the io-lock held everything should
     * be quiet, but we've potentially still got dirty pages and we
     * need to tell the destination to throw any pages it's already received
     * that are dirty
     */
    if (migrate_postcopy_ram()) {
        ram_postcopy_send_discard_bitmap(ms);
    }

    /*
     * send rest of state - note things that are doing postcopy
     * will notice we're in POSTCOPY_ACTIVE and not actually
     * wrap their state up here
     */
    migration_rate_set(bandwidth);
    if (migrate_postcopy_ram()) {
        /* Ping just for debugging, helps line traces up */
        qemu_savevm_send_ping(ms->to_dst_file, 2);
    }

    /*
     * While loading the device state we may trigger page transfer
     * requests and the fd must be free to process those, and thus
     * the destination must read the whole device state off the fd before
     * it starts processing it.  Unfortunately the ad-hoc migration format
     * doesn't allow the destination to know the size to read without fully
     * parsing it through each devices load-state code (especially the open
     * coded devices that use get/put).
     * So we wrap the device state up in a package with a length at the start;
     * to do this we use a qemu_buf to hold the whole of the device state.
     */
    bioc = qio_channel_buffer_new(4096);
    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
    fb = qemu_file_new_output(QIO_CHANNEL(bioc));
    /* fb keeps its own reference to the channel buffer */
    object_unref(OBJECT(bioc));

    /*
     * Make sure the receiver can get incoming pages before we send the rest
     * of the state
     */
    qemu_savevm_send_postcopy_listen(fb);

    qemu_savevm_state_complete_precopy(fb, false, false);
    if (migrate_postcopy_ram()) {
        qemu_savevm_send_ping(fb, 3);
    }

    qemu_savevm_send_postcopy_run(fb);

    /* <><> end of stuff going into the package */

    /* Last point of recovery; as soon as we send the package the destination
     * can open devices and potentially start running.
     * Lets just check again we've not got any errors.
     */
    ret = qemu_file_get_error(ms->to_dst_file);
    if (ret) {
        error_setg(errp, "postcopy_start: Migration stream errored (pre package)");
        goto fail_closefb;
    }

    restart_block = false;

    /* Now send that blob */
    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
        goto fail_closefb;
    }
    qemu_fclose(fb);

    /* Send a notify to give a chance for anything that needs to happen
     * at the transition to postcopy and after the device state; in particular
     * spice needs to trigger a transition now
     */
    ms->postcopy_after_devices = true;
    migration_call_notifiers(ms);

    migration_downtime_end(ms);

    qemu_mutex_unlock_iothread();

    if (migrate_postcopy_ram()) {
        /*
         * Although this ping is just for debug, it could potentially be
         * used for getting a better measurement of downtime at the source.
         */
        qemu_savevm_send_ping(ms->to_dst_file, 4);
    }

    if (migrate_release_ram()) {
        ram_postcopy_migrated_memory_release(ms);
    }

    ret = qemu_file_get_error(ms->to_dst_file);
    if (ret) {
        error_setg(errp, "postcopy_start: Migration stream errored");
        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                          MIGRATION_STATUS_FAILED);
    }

    trace_postcopy_preempt_enabled(migrate_postcopy_preempt());

    return ret;

fail_closefb:
    qemu_fclose(fb);
fail:
    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    if (restart_block) {
        /* A failure happened early enough that we know the destination hasn't
         * accessed block devices, so we're safe to recover.
         */
        Error *local_err = NULL;

        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        }
    }
    qemu_mutex_unlock_iothread();
    return -1;
}

/**
 * migration_maybe_pause: Pause if required to by
 * migrate_pause_before_switchover called with the iothread locked
 * Returns: 0 on success
 */
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state)
{
    if (!migrate_pause_before_switchover()) {
        return 0;
    }

    /* Since leaving this state is not atomic with posting the semaphore
     * it's possible that someone could have issued multiple migrate_continue
     * and the semaphore is incorrectly positive at this point;
     * the docs say it's undefined to reinit a semaphore that's already
     * init'd, so use timedwait to eat up any existing posts.
     */
    while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
        /* This block intentionally left blank */
    }

    /*
     * If the migration is cancelled when it is in the completion phase,
     * the migration state is set to MIGRATION_STATUS_CANCELLING.
     * So we don't need to wait a semaphore, otherwise we would always
     * wait for the 'pause_sem' semaphore.
     */
    if (s->state != MIGRATION_STATUS_CANCELLING) {
        /* Drop the iothread lock while we block on pause_sem */
        qemu_mutex_unlock_iothread();
        migrate_set_state(&s->state, *current_active_state,
                          MIGRATION_STATUS_PRE_SWITCHOVER);
        qemu_sem_wait(&s->pause_sem);
        migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
                          new_state);
        *current_active_state = new_state;
        qemu_mutex_lock_iothread();
    }

    return s->state == new_state ? 0 : -EINVAL;
}

/*
 * Precopy side of migration_completion(): stop the VM, optionally pause
 * before switchover, inactivate block devices (except for COLO) and send
 * the final device state.  Called with the iothread unlocked; returns the
 * usual 0/-errno convention.
 */
static int migration_completion_precopy(MigrationState *s,
                                        int *current_active_state)
{
    int ret;

    qemu_mutex_lock_iothread();
    migration_downtime_start(s);
    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);

    /* Remember the runstate so migration_iteration_finish can restore it */
    s->vm_old_state = runstate_get();
    global_state_store();

    ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
    trace_migration_completion_vm_stop(ret);
    if (ret < 0) {
        goto out_unlock;
    }

    ret = migration_maybe_pause(s, current_active_state,
                                MIGRATION_STATUS_DEVICE);
    if (ret < 0) {
        goto out_unlock;
    }

    /*
     * Inactivate disks except in COLO, and track that we have done so in order
     * to remember to reactivate them if migration fails or is cancelled.
     */
    s->block_inactive = !migrate_colo();
    migration_rate_set(RATE_LIMIT_DISABLED);
    ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
                                             s->block_inactive);
out_unlock:
    qemu_mutex_unlock_iothread();
    return ret;
}

/*
 * Postcopy side of migration_completion(): flush the remaining device
 * state with the iothread held, then tear down the preempt channel if the
 * destination still needs that (pre-8.0 binaries).
 */
static void migration_completion_postcopy(MigrationState *s)
{
    trace_migration_completion_postcopy_end();

    qemu_mutex_lock_iothread();
    qemu_savevm_state_complete_postcopy(s->to_dst_file);
    qemu_mutex_unlock_iothread();

    /*
     * Shutdown the postcopy fast path thread.  This is only needed when dest
     * QEMU binary is old (7.1/7.2).  QEMU 8.0+ doesn't need this.
     */
    if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
        postcopy_preempt_shutdown_file(s);
    }

    trace_migration_completion_postcopy_end_after_complete();
}

/*
 * Failure path of migration_completion(): reactivate block devices if we
 * had inactivated them, then move the state machine to FAILED.
 */
static void migration_completion_failed(MigrationState *s,
                                        int current_active_state)
{
    if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
                              s->state == MIGRATION_STATUS_DEVICE)) {
        /*
         * If not doing postcopy, vm_start() will be called: let's
         * regain control on images.
         */
        Error *local_err = NULL;

        qemu_mutex_lock_iothread();
        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        } else {
            s->block_inactive = false;
        }
        qemu_mutex_unlock_iothread();
    }

    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_FAILED);
}

/**
 * migration_completion: Used by migration_thread when there's not much left.
 * The caller 'breaks' the loop when this returns.
2702 * 2703 * @s: Current migration state 2704 */ 2705 static void migration_completion(MigrationState *s) 2706 { 2707 int ret = 0; 2708 int current_active_state = s->state; 2709 2710 if (s->state == MIGRATION_STATUS_ACTIVE) { 2711 ret = migration_completion_precopy(s, ¤t_active_state); 2712 } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { 2713 migration_completion_postcopy(s); 2714 } else { 2715 ret = -1; 2716 } 2717 2718 if (ret < 0) { 2719 goto fail; 2720 } 2721 2722 if (close_return_path_on_source(s)) { 2723 goto fail; 2724 } 2725 2726 if (qemu_file_get_error(s->to_dst_file)) { 2727 trace_migration_completion_file_err(); 2728 goto fail; 2729 } 2730 2731 if (migrate_colo() && s->state == MIGRATION_STATUS_ACTIVE) { 2732 /* COLO does not support postcopy */ 2733 migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, 2734 MIGRATION_STATUS_COLO); 2735 } else { 2736 migrate_set_state(&s->state, current_active_state, 2737 MIGRATION_STATUS_COMPLETED); 2738 } 2739 2740 return; 2741 2742 fail: 2743 migration_completion_failed(s, current_active_state); 2744 } 2745 2746 /** 2747 * bg_migration_completion: Used by bg_migration_thread when after all the 2748 * RAM has been saved. The caller 'breaks' the loop when this returns. 2749 * 2750 * @s: Current migration state 2751 */ 2752 static void bg_migration_completion(MigrationState *s) 2753 { 2754 int current_active_state = s->state; 2755 2756 if (s->state == MIGRATION_STATUS_ACTIVE) { 2757 /* 2758 * By this moment we have RAM content saved into the migration stream. 2759 * The next step is to flush the non-RAM content (device state) 2760 * right after the ram content. The device state has been stored into 2761 * the temporary buffer before RAM saving started. 
         */
        qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
        qemu_fflush(s->to_dst_file);
    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
        goto fail;
    }

    if (qemu_file_get_error(s->to_dst_file)) {
        trace_migration_completion_file_err();
        goto fail;
    }

    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_COMPLETED);
    return;

fail:
    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_FAILED);
}

/* Outcome of an error-detection/recovery pass in the migration thread */
typedef enum MigThrError {
    /* No error detected */
    MIG_THR_ERR_NONE = 0,
    /* Detected error, but resumed successfully */
    MIG_THR_ERR_RECOVERED = 1,
    /* Detected fatal error, need to exit */
    MIG_THR_ERR_FATAL = 2,
} MigThrError;

/*
 * Final resume handshake: ask the destination to resume, then wait on the
 * return path until the state leaves POSTCOPY_RECOVER.  Returns 0 once we
 * are back in POSTCOPY_ACTIVE, -1 otherwise.
 */
static int postcopy_resume_handshake(MigrationState *s)
{
    qemu_savevm_send_postcopy_resume(s->to_dst_file);

    while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
        if (migration_rp_wait(s)) {
            return -1;
        }
    }

    if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        return 0;
    }

    return -1;
}

/* Return zero if success, or <0 for error */
static int postcopy_do_resume(MigrationState *s)
{
    int ret;

    /*
     * Call all the resume_prepare() hooks, so that modules can be
     * ready for the migration resume.
     */
    ret = qemu_savevm_state_resume_prepare(s);
    if (ret) {
        error_report("%s: resume_prepare() failure detected: %d",
                     __func__, ret);
        return ret;
    }

    /*
     * If preempt is enabled, re-establish the preempt channel.  Note that
     * we do it after resume prepare to make sure the main channel will be
     * created before the preempt channel.  E.g. with weak network, the
     * dest QEMU may get messed up with the preempt and main channels on
     * the order of connection setup.  This guarantees the correct order.
     */
    ret = postcopy_preempt_establish_channel(s);
    if (ret) {
        error_report("%s: postcopy_preempt_establish_channel(): %d",
                     __func__, ret);
        return ret;
    }

    /*
     * Last handshake with destination on the resume (destination will
     * switch to postcopy-active afterwards)
     */
    ret = postcopy_resume_handshake(s);
    if (ret) {
        error_report("%s: handshake failed: %d", __func__, ret);
        return ret;
    }

    return 0;
}

/*
 * We don't return until we are in a safe state to continue current
 * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
 * MIG_THR_ERR_FATAL if unrecovery failure happened.
 */
static MigThrError postcopy_pause(MigrationState *s)
{
    assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);

    while (true) {
        QEMUFile *file;

        /*
         * Current channel is possibly broken. Release it. Note that this is
         * guaranteed even without lock because to_dst_file should only be
         * modified by the migration thread. That also guarantees that the
         * unregister of yank is safe too without the lock. It should be safe
         * even to be within the qemu_file_lock, but we didn't do that to avoid
         * taking more mutex (yank_lock) within qemu_file_lock. TL;DR: we make
         * the qemu_file_lock critical section as small as possible.
         */
        assert(s->to_dst_file);
        migration_ioc_unregister_yank_from_file(s->to_dst_file);
        qemu_mutex_lock(&s->qemu_file_lock);
        file = s->to_dst_file;
        s->to_dst_file = NULL;
        qemu_mutex_unlock(&s->qemu_file_lock);

        qemu_file_shutdown(file);
        qemu_fclose(file);

        /*
         * We're already pausing, so ignore any errors on the return
         * path and just wait for the thread to finish. It will be
         * re-created when we resume.
         */
        close_return_path_on_source(s);

        migrate_set_state(&s->state, s->state,
                          MIGRATION_STATUS_POSTCOPY_PAUSED);

        error_report("Detected IO failure for postcopy. "
                     "Migration paused.");

        /*
         * We wait until things fixed up. Then someone will setup the
         * status back for us.
         */
        while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
            qemu_sem_wait(&s->postcopy_pause_sem);
        }

        if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
            /* Woken up by a recover procedure. Give it a shot */

            /* Do the resume logic */
            if (postcopy_do_resume(s) == 0) {
                /* Let's continue! */
                trace_postcopy_pause_continued();
                return MIG_THR_ERR_RECOVERED;
            } else {
                /*
                 * Something wrong happened during the recovery, let's
                 * pause again. Pause is always better than throwing
                 * data away.
                 */
                continue;
            }
        } else {
            /* This is not right... Time to quit. */
            return MIG_THR_ERR_FATAL;
        }
    }
}

/*
 * Check the migration streams for errors and decide what the migration
 * thread should do: continue (NONE), keep going after a postcopy
 * pause/recover cycle (RECOVERED), or bail out (FATAL).
 */
static MigThrError migration_detect_error(MigrationState *s)
{
    int ret;
    int state = s->state;
    Error *local_error = NULL;

    if (state == MIGRATION_STATUS_CANCELLING ||
        state == MIGRATION_STATUS_CANCELLED) {
        /* End the migration, but don't set the state to failed */
        return MIG_THR_ERR_FATAL;
    }

    /*
     * Try to detect any file errors.  Note that postcopy_qemufile_src will
     * be NULL when postcopy preempt is not enabled.
     */
    ret = qemu_file_get_error_obj_any(s->to_dst_file,
                                      s->postcopy_qemufile_src,
                                      &local_error);
    if (!ret) {
        /* Everything is fine */
        assert(!local_error);
        return MIG_THR_ERR_NONE;
    }

    if (local_error) {
        migrate_set_error(s, local_error);
        error_free(local_error);
    }

    if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
        /*
         * For postcopy, we allow the network to be down for a
         * while. After that, it can be continued by a
         * recovery phase.
         */
        return postcopy_pause(s);
    } else {
        /*
         * For precopy (or postcopy with error outside IO), we fail
         * with no time.
         */
        migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
        trace_migration_thread_file_err();

        /* Time to stop the migration, now. */
        return MIG_THR_ERR_FATAL;
    }
}

/*
 * Fill in the final downtime/total-time/throughput statistics once the
 * migration has completed.
 */
static void migration_calculate_complete(MigrationState *s)
{
    uint64_t bytes = migration_transferred_bytes();
    int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    int64_t transfer_time;

    migration_downtime_end(s);
    s->total_time = end_time - s->start_time;
    transfer_time = s->total_time - s->setup_time;
    if (transfer_time) {
        /* bytes -> Mbit/s: transfer_time is in ms */
        s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
    }
}

static void update_iteration_initial_status(MigrationState *s)
{
    /*
     * Update these three fields at the same time to avoid mismatch info lead
     * wrong speed calculation.
     */
    s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    s->iteration_initial_bytes = migration_transferred_bytes();
    s->iteration_initial_pages = ram_get_total_transferred_pages();
}

/*
 * Refresh per-iteration statistics (bandwidth, mbps, pages/s), recompute
 * the switchover threshold size, and estimate the expected downtime.
 * Only does work once per BUFFER_DELAY window.
 */
static void migration_update_counters(MigrationState *s,
                                      int64_t current_time)
{
    uint64_t transferred, transferred_pages, time_spent;
    uint64_t current_bytes; /* bytes transferred since the beginning */
    uint64_t switchover_bw;
    /* Expected bandwidth when switching over to destination QEMU */
    double expected_bw_per_ms;
    double bandwidth;

    if (current_time < s->iteration_start_time + BUFFER_DELAY) {
        return;
    }

    switchover_bw = migrate_avail_switchover_bandwidth();
    current_bytes = migration_transferred_bytes();
    transferred = current_bytes - s->iteration_initial_bytes;
    time_spent = current_time - s->iteration_start_time;
    bandwidth = (double)transferred / time_spent;

    if (switchover_bw) {
        /*
         * If the user specified a switchover bandwidth, let's trust the
         * user so that can be more accurate than what we estimated.
         */
        expected_bw_per_ms = switchover_bw / 1000;
    } else {
        /* If the user doesn't specify bandwidth, we use the estimated */
        expected_bw_per_ms = bandwidth;
    }

    s->threshold_size = expected_bw_per_ms * migrate_downtime_limit();

    s->mbps = (((double) transferred * 8.0) /
               ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;

    transferred_pages = ram_get_total_transferred_pages() -
                            s->iteration_initial_pages;
    s->pages_per_second = (double) transferred_pages /
                             (((double) time_spent / 1000.0));

    /*
     * if we haven't sent anything, we don't want to
     * recalculate. 10000 is a small enough number for our purposes
     */
    if (stat64_get(&mig_stats.dirty_pages_rate) &&
        transferred > 10000) {
        s->expected_downtime =
            stat64_get(&mig_stats.dirty_bytes_last_sync) / expected_bw_per_ms;
    }

    migration_rate_reset();

    update_iteration_initial_status(s);

    trace_migrate_transferred(transferred, time_spent,
                              /* Both in unit bytes/ms */
                              bandwidth, switchover_bw / 1000,
                              s->threshold_size);
}

/*
 * Whether the source is allowed to switch over to the destination now.
 * Only gated when the switchover-ack capability is on and the VM is still
 * running; otherwise switchover is always permitted.
 */
static bool migration_can_switchover(MigrationState *s)
{
    if (!migrate_switchover_ack()) {
        return true;
    }

    /* No reason to wait for switchover ACK if VM is stopped */
    if (!runstate_is_running()) {
        return true;
    }

    return s->switchover_acked;
}

/* Migration thread iteration status */
typedef enum {
    MIG_ITERATE_RESUME, /* Resume current iteration */
    MIG_ITERATE_SKIP, /* Skip current iteration */
    MIG_ITERATE_BREAK, /* Break the loop */
} MigIterateState;

/*
 * Return true if continue to the next iteration directly, false
 * otherwise.
 */
static MigIterateState migration_iteration_run(MigrationState *s)
{
    uint64_t must_precopy, can_postcopy;
    Error *local_err = NULL;
    bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
    bool can_switchover = migration_can_switchover(s);

    /* Cheap estimate first; only do the exact (expensive) count if close */
    qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
    uint64_t pending_size = must_precopy + can_postcopy;

    trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);

    if (must_precopy <= s->threshold_size) {
        qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
        pending_size = must_precopy + can_postcopy;
        trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
    }

    if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
        trace_migration_thread_low_pending(pending_size);
        migration_completion(s);
        return MIG_ITERATE_BREAK;
    }

    /* Still a significant amount to transfer */
    if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover &&
        qatomic_read(&s->start_postcopy)) {
        if (postcopy_start(s, &local_err)) {
            migrate_set_error(s, local_err);
            error_report_err(local_err);
        }
        return MIG_ITERATE_SKIP;
    }

    /* Just another iteration step */
    qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
    return MIG_ITERATE_RESUME;
}

/*
 * Tear-down after the migration thread's main loop exits: stop throttling,
 * restore or advance the run state depending on the final migration
 * status, and schedule fd cleanup.
 */
static void migration_iteration_finish(MigrationState *s)
{
    /* If we enabled cpu throttling for auto-converge, turn it off. */
    cpu_throttle_stop();

    qemu_mutex_lock_iothread();
    switch (s->state) {
    case MIGRATION_STATUS_COMPLETED:
        migration_calculate_complete(s);
        runstate_set(RUN_STATE_POSTMIGRATE);
        break;
    case MIGRATION_STATUS_COLO:
        assert(migrate_colo());
        migrate_start_colo_process(s);
        s->vm_old_state = RUN_STATE_RUNNING;
        /* Fallthrough */
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_CANCELLING:
        if (s->vm_old_state == RUN_STATE_RUNNING) {
            if (!runstate_check(RUN_STATE_SHUTDOWN)) {
                vm_start();
            }
        } else {
            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
                runstate_set(s->vm_old_state);
            }
        }
        break;

    default:
        /* Should not reach here, but if so, forgive the VM. */
        error_report("%s: Unknown ending state %d", __func__, s->state);
        break;
    }
    migrate_fd_cleanup_schedule(s);
    qemu_mutex_unlock_iothread();
}

static void bg_migration_iteration_finish(MigrationState *s)
{
    /*
     * Stop tracking RAM writes - un-protect memory, un-register UFFD
     * memory ranges, flush kernel wait queues and wake up threads
     * waiting for write fault to be resolved.
     */
    ram_write_tracking_stop();

    qemu_mutex_lock_iothread();
    switch (s->state) {
    case MIGRATION_STATUS_COMPLETED:
        migration_calculate_complete(s);
        break;

    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_CANCELLING:
        break;

    default:
        /* Should not reach here, but if so, forgive the VM. */
        error_report("%s: Unknown ending state %d", __func__, s->state);
        break;
    }

    migrate_fd_cleanup_schedule(s);
    qemu_mutex_unlock_iothread();
}

/*
 * Return true if continue to the next iteration directly, false
 * otherwise.
3200 */ 3201 static MigIterateState bg_migration_iteration_run(MigrationState *s) 3202 { 3203 int res; 3204 3205 res = qemu_savevm_state_iterate(s->to_dst_file, false); 3206 if (res > 0) { 3207 bg_migration_completion(s); 3208 return MIG_ITERATE_BREAK; 3209 } 3210 3211 return MIG_ITERATE_RESUME; 3212 } 3213 3214 void migration_make_urgent_request(void) 3215 { 3216 qemu_sem_post(&migrate_get_current()->rate_limit_sem); 3217 } 3218 3219 void migration_consume_urgent_request(void) 3220 { 3221 qemu_sem_wait(&migrate_get_current()->rate_limit_sem); 3222 } 3223 3224 /* Returns true if the rate limiting was broken by an urgent request */ 3225 bool migration_rate_limit(void) 3226 { 3227 int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 3228 MigrationState *s = migrate_get_current(); 3229 3230 bool urgent = false; 3231 migration_update_counters(s, now); 3232 if (migration_rate_exceeded(s->to_dst_file)) { 3233 3234 if (qemu_file_get_error(s->to_dst_file)) { 3235 return false; 3236 } 3237 /* 3238 * Wait for a delay to do rate limiting OR 3239 * something urgent to post the semaphore. 3240 */ 3241 int ms = s->iteration_start_time + BUFFER_DELAY - now; 3242 trace_migration_rate_limit_pre(ms); 3243 if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) { 3244 /* 3245 * We were woken by one or more urgent things but 3246 * the timedwait will have consumed one of them. 3247 * The service routine for the urgent wake will dec 3248 * the semaphore itself for each item it consumes, 3249 * so add this one we just eat back. 
 */
            qemu_sem_post(&s->rate_limit_sem);
            urgent = true;
        }
        trace_migration_rate_limit_post(urgent);
    }
    return urgent;
}

/*
 * If failover devices are present, wait until they are completely
 * unplugged before moving from old_state to new_state.
 */
static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
                                    int new_state)
{
    if (qemu_savevm_state_guest_unplug_pending()) {
        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);

        /* Poll every 250ms while unplugs are pending and nobody cancelled */
        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
               qemu_savevm_state_guest_unplug_pending()) {
            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
        }
        if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
            int timeout = 120; /* 30 seconds */
            /*
             * migration has been canceled
             * but as we have started an unplug we must wait the end
             * to be able to plug back the card
             */
            while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
                qemu_sem_timedwait(&s->wait_unplug_sem, 250);
            }
            if (qemu_savevm_state_guest_unplug_pending() &&
                !qtest_enabled()) {
                warn_report("migration: partially unplugged device on "
                            "failure");
            }
        }

        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
    } else {
        migrate_set_state(&s->state, old_state, new_state);
    }
}

/*
 * Master migration thread on the source VM.
 * It drives the migration and pumps the data down the outgoing channel.
 */
static void *migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    MigrationThread *thread = NULL;
    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    MigThrError thr_error;
    bool urgent = false;

    thread = migration_threads_add("live_migration", qemu_get_thread_id());

    rcu_register_thread();

    /* Keep the state object alive for the lifetime of this thread */
    object_ref(OBJECT(s));
    update_iteration_initial_status(s);

    qemu_mutex_lock_iothread();
    qemu_savevm_state_header(s->to_dst_file);
    qemu_mutex_unlock_iothread();

    /*
     * If we opened the return path, we need to make sure dst has it
     * opened as well.
     */
    if (s->rp_state.rp_thread_created) {
        /* Now tell the dest that it should open its end so it can reply */
        qemu_savevm_send_open_return_path(s->to_dst_file);

        /* And do a ping that will make stuff easier to debug */
        qemu_savevm_send_ping(s->to_dst_file, 1);
    }

    if (migrate_postcopy()) {
        /*
         * Tell the destination that we *might* want to do postcopy later;
         * if the other end can't do postcopy it should fail now, nice and
         * early.
         */
        qemu_savevm_send_postcopy_advise(s->to_dst_file);
    }

    if (migrate_colo()) {
        /* Notify migration destination that we enable COLO */
        qemu_savevm_send_colo_enable(s->to_dst_file);
    }

    qemu_mutex_lock_iothread();
    qemu_savevm_state_setup(s->to_dst_file);
    qemu_mutex_unlock_iothread();

    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
                            MIGRATION_STATUS_ACTIVE);

    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;

    trace_migration_thread_setup_complete();

    /* Main pump loop: iterate until completion, fatal error or cancel */
    while (migration_is_active(s)) {
        if (urgent || !migration_rate_exceeded(s->to_dst_file)) {
            MigIterateState iter_state = migration_iteration_run(s);
            if (iter_state == MIG_ITERATE_SKIP) {
                continue;
            } else if (iter_state == MIG_ITERATE_BREAK) {
                break;
            }
        }

        /*
         * Try to detect any kind of failures, and see whether we
         * should stop the migration now.
         */
        thr_error = migration_detect_error(s);
        if (thr_error == MIG_THR_ERR_FATAL) {
            /* Stop migration */
            break;
        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
            /*
             * Just recovered from e.g. a network failure, reset all
             * the local variables. This is important to avoid
             * breaking transferred_bytes and bandwidth calculation
             */
            update_iteration_initial_status(s);
        }

        urgent = migration_rate_limit();
    }

    trace_migration_thread_after_loop();
    migration_iteration_finish(s);
    object_unref(OBJECT(s));
    rcu_unregister_thread();
    migration_threads_remove(thread);
    return NULL;
}

/*
 * Bottom half that restarts the guest for a background snapshot and
 * records the end of the downtime window.  vm_start() is deliberately
 * deferred to a BH; see the comment in bg_migration_thread().
 */
static void bg_migration_vm_start_bh(void *opaque)
{
    MigrationState *s = opaque;

    qemu_bh_delete(s->vm_start_bh);
    s->vm_start_bh = NULL;

    vm_start();
    migration_downtime_end(s);
}

/**
 * Background snapshot thread, based on live migration code.
 * This is an alternative implementation of live migration mechanism
 * introduced specifically to support background snapshots.
 *
 * It takes advantage of userfault_fd write protection mechanism introduced
 * in v5.7 kernel. Compared to existing dirty page logging migration much
 * lesser stream traffic is produced resulting in smaller snapshot images,
 * simply cause of no page duplicates can get into the stream.
 *
 * Another key point is that generated vmstate stream reflects machine state
 * 'frozen' at the beginning of snapshot creation compared to dirty page logging
 * mechanism, which effectively results in that saved snapshot is the state of VM
 * at the end of the process.
 */
static void *bg_migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    int64_t setup_start;
    MigThrError thr_error;
    QEMUFile *fb;
    /* True until UFFD write tracking is up; selects the failure path */
    bool early_fail = true;

    rcu_register_thread();
    object_ref(OBJECT(s));

    /* Snapshots write to a local buffer/stream: no bandwidth cap needed */
    migration_rate_set(RATE_LIMIT_DISABLED);

    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    /*
     * We want to save vmstate for the moment when migration has been
     * initiated but also we want to save RAM content while VM is running.
     * The RAM content should appear first in the vmstate. So, we first
     * stash the non-RAM part of the vmstate to the temporary buffer,
     * then write RAM part of the vmstate to the migration stream
     * with vCPUs running and, finally, write stashed non-RAM part of
     * the vmstate from the buffer to the migration stream.
     */
    s->bioc = qio_channel_buffer_new(512 * 1024);
    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
    fb = qemu_file_new_output(QIO_CHANNEL(s->bioc));
    /* fb now holds the only reference needed; drop ours */
    object_unref(OBJECT(s->bioc));

    update_iteration_initial_status(s);

    /*
     * Prepare for tracking memory writes with UFFD-WP - populate
     * RAM pages before protecting.
     */
#ifdef __linux__
    ram_write_tracking_prepare();
#endif

    qemu_mutex_lock_iothread();
    qemu_savevm_state_header(s->to_dst_file);
    qemu_savevm_state_setup(s->to_dst_file);
    qemu_mutex_unlock_iothread();

    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
                            MIGRATION_STATUS_ACTIVE);

    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;

    trace_migration_thread_setup_complete();
    migration_downtime_start(s);

    qemu_mutex_lock_iothread();

    /*
     * If VM is currently in suspended state, then, to make a valid runstate
     * transition in vm_stop_force_state() we need to wake it up.
     */
    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
    s->vm_old_state = runstate_get();

    global_state_store();
    /* Forcibly stop VM before saving state of vCPUs and devices */
    if (migration_stop_vm(RUN_STATE_PAUSED)) {
        goto fail;
    }
    /*
     * Put vCPUs in sync with shadow context structures, then
     * save their state to channel-buffer along with devices.
     */
    cpu_synchronize_all_states();
    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
        goto fail;
    }
    /*
     * Since we are going to get non-iterable state data directly
     * from s->bioc->data, explicit flush is needed here.
     */
    qemu_fflush(fb);

    /* Now initialize UFFD context and start tracking RAM writes */
    if (ram_write_tracking_start()) {
        goto fail;
    }
    early_fail = false;

    /*
     * Start VM from BH handler to avoid write-fault lock here.
     * UFFD-WP protection for the whole RAM is already enabled so
     * calling VM state change notifiers from vm_start() would initiate
     * writes to virtio VQs memory which is in write-protected region.
     */
    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
    qemu_bh_schedule(s->vm_start_bh);

    qemu_mutex_unlock_iothread();

    while (migration_is_active(s)) {
        MigIterateState iter_state = bg_migration_iteration_run(s);
        if (iter_state == MIG_ITERATE_SKIP) {
            continue;
        } else if (iter_state == MIG_ITERATE_BREAK) {
            break;
        }

        /*
         * Try to detect any kind of failures, and see whether we
         * should stop the migration now.
         */
        thr_error = migration_detect_error(s);
        if (thr_error == MIG_THR_ERR_FATAL) {
            /* Stop migration */
            break;
        }

        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
    }

    trace_migration_thread_after_loop();

fail:
    if (early_fail) {
        /*
         * Failed before write tracking was established: the iothread
         * lock is still held at this point, so mark the migration
         * failed and release the lock before cleanup.
         */
        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_FAILED);
        qemu_mutex_unlock_iothread();
    }

    bg_migration_iteration_finish(s);

    qemu_fclose(fb);
    object_unref(OBJECT(s));
    rcu_unregister_thread();

    return NULL;
}

/*
 * Kick off an outgoing migration on an already-established channel.
 * Handles both a fresh migration and a postcopy recovery (resume);
 * error_in, if set, carries an error from channel setup.
 */
void migrate_fd_connect(MigrationState *s, Error *error_in)
{
    Error *local_err = NULL;
    uint64_t rate_limit;
    bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;

    /*
     * If there's a previous error, free it and prepare for another one.
     * Meanwhile if migration completes successfully, there won't be an error
     * dumped when calling migrate_fd_cleanup().
     */
    migrate_error_free(s);

    s->expected_downtime = migrate_downtime_limit();
    if (resume) {
        /* A resumed migration reuses the cleanup BH from the first run */
        assert(s->cleanup_bh);
    } else {
        assert(!s->cleanup_bh);
        s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
    }
    if (error_in) {
        migrate_fd_error(s, error_in);
        if (resume) {
            /*
             * Don't do cleanup for resume if channel is invalid, but only dump
             * the error. We wait for another channel connect from the user.
             * The error_report still gives HMP user a hint on what failed.
             * It's normally done in migrate_fd_cleanup(), but call it here
             * explicitly.
             */
            error_report_err(error_copy(s->error));
        } else {
            migrate_fd_cleanup(s);
        }
        return;
    }

    if (resume) {
        /* This is a resumed migration */
        rate_limit = migrate_max_postcopy_bandwidth();
    } else {
        /* This is a fresh new migration */
        rate_limit = migrate_max_bandwidth();

        /* Notify before starting migration thread */
        migration_call_notifiers(s);
    }

    migration_rate_set(rate_limit);
    qemu_file_set_blocking(s->to_dst_file, true);

    /*
     * Open the return path. For postcopy, it is used exclusively. For
     * precopy, QEMU uses the return path only if the user specified the
     * "return-path" capability.
     */
    if (migrate_postcopy_ram() || migrate_return_path()) {
        if (open_return_path_on_source(s)) {
            error_setg(&local_err, "Unable to open return-path for postcopy");
            migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
            migrate_set_error(s, local_err);
            error_report_err(local_err);
            migrate_fd_cleanup(s);
            return;
        }
    }

    /*
     * This needs to be done before resuming a postcopy. Note: for newer
     * QEMUs we will delay the channel creation until postcopy_start(), to
     * avoid disorder of channel creations.
     */
    if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
        postcopy_preempt_setup(s);
    }

    if (resume) {
        /* Wakeup the main migration thread to do the recovery */
        migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);
        qemu_sem_post(&s->postcopy_pause_sem);
        return;
    }

    if (multifd_save_setup(&local_err) != 0) {
        migrate_set_error(s, local_err);
        error_report_err(local_err);
        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                          MIGRATION_STATUS_FAILED);
        migrate_fd_cleanup(s);
        return;
    }

    /* Background snapshots and live migration use dedicated thread bodies */
    if (migrate_background_snapshot()) {
        qemu_thread_create(&s->thread, "bg_snapshot",
                           bg_migration_thread, s, QEMU_THREAD_JOINABLE);
    } else {
        qemu_thread_create(&s->thread, "live_migration",
                           migration_thread, s, QEMU_THREAD_JOINABLE);
    }
    s->migration_thread_running = true;
}

/* QOM class init: migration object is internal, not user-creatable */
static void migration_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);

    dc->user_creatable = false;
    device_class_set_props(dc, migration_properties);
}

/*
 * QOM instance finalizer: destroy every mutex/semaphore created in
 * migration_instance_init() and free any pending error.
 */
static void migration_instance_finalize(Object *obj)
{
    MigrationState *ms = MIGRATION_OBJ(obj);

    qemu_mutex_destroy(&ms->error_mutex);
    qemu_mutex_destroy(&ms->qemu_file_lock);
    qemu_sem_destroy(&ms->wait_unplug_sem);
    qemu_sem_destroy(&ms->rate_limit_sem);
    qemu_sem_destroy(&ms->pause_sem);
    qemu_sem_destroy(&ms->postcopy_pause_sem);
    qemu_sem_destroy(&ms->rp_state.rp_sem);
    qemu_sem_destroy(&ms->rp_state.rp_pong_acks);
    qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
    error_free(ms->error);
}

/* QOM instance init: set field defaults and create sync primitives */
static void migration_instance_init(Object *obj)
{
    MigrationState *ms = MIGRATION_OBJ(obj);

    ms->state = MIGRATION_STATUS_NONE;
    /* -1 means "not measured yet" for the throughput statistics */
    ms->mbps = -1;
    ms->pages_per_second = -1;
    qemu_sem_init(&ms->pause_sem, 0);
    qemu_mutex_init(&ms->error_mutex);

    migrate_params_init(&ms->parameters);

    qemu_sem_init(&ms->postcopy_pause_sem, 0);
    qemu_sem_init(&ms->rp_state.rp_sem, 0);
    qemu_sem_init(&ms->rp_state.rp_pong_acks, 0);
    qemu_sem_init(&ms->rate_limit_sem, 0);
    qemu_sem_init(&ms->wait_unplug_sem, 0);
    qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
    qemu_mutex_init(&ms->qemu_file_lock);
}

/*
 * Return true if check pass, false otherwise. Error will be put
 * inside errp if provided.
 */
static bool migration_object_check(MigrationState *ms, Error **errp)
{
    /* Assuming all off */
    bool old_caps[MIGRATION_CAPABILITY__MAX] = { 0 };

    if (!migrate_params_check(&ms->parameters, errp)) {
        return false;
    }

    return migrate_caps_check(old_caps, ms->capabilities, errp);
}

static const TypeInfo migration_type = {
    .name = TYPE_MIGRATION,
    /*
     * NOTE: TYPE_MIGRATION is not really a device, as the object is
     * not created using qdev_new(), it is not attached to the qdev
     * device tree, and it is never realized.
     *
     * TODO: Make this TYPE_OBJECT once QOM provides something like
     * TYPE_DEVICE's "-global" properties.
     */
    .parent = TYPE_DEVICE,
    .class_init = migration_class_init,
    .class_size = sizeof(MigrationClass),
    .instance_size = sizeof(MigrationState),
    .instance_init = migration_instance_init,
    .instance_finalize = migration_instance_finalize,
};

/* Register the migration QOM type at module-init time */
static void register_migration_types(void)
{
    type_register_static(&migration_type);
}

type_init(register_migration_types);