1 /*
2 * QEMU live migration
3 *
4 * Copyright IBM, Corp. 2008
5 *
6 * Authors:
7 * Anthony Liguori <aliguori@us.ibm.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Contributions after 2012-01-13 are licensed under the terms of the
13 * GNU GPL, version 2 or (at your option) any later version.
14 */
15
16 #include "qemu/osdep.h"
17 #include "qemu/cutils.h"
18 #include "qemu/error-report.h"
19 #include "qemu/main-loop.h"
20 #include "migration/blocker.h"
21 #include "exec.h"
22 #include "fd.h"
23 #include "file.h"
24 #include "socket.h"
25 #include "sysemu/runstate.h"
26 #include "sysemu/sysemu.h"
27 #include "sysemu/cpu-throttle.h"
28 #include "rdma.h"
29 #include "ram.h"
30 #include "ram-compress.h"
31 #include "migration/global_state.h"
32 #include "migration/misc.h"
33 #include "migration.h"
34 #include "migration-stats.h"
35 #include "savevm.h"
36 #include "qemu-file.h"
37 #include "channel.h"
38 #include "migration/vmstate.h"
39 #include "block/block.h"
40 #include "qapi/error.h"
41 #include "qapi/clone-visitor.h"
42 #include "qapi/qapi-visit-migration.h"
43 #include "qapi/qapi-visit-sockets.h"
44 #include "qapi/qapi-commands-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "qapi/qmp/qnull.h"
48 #include "qemu/rcu.h"
49 #include "block.h"
50 #include "postcopy-ram.h"
51 #include "qemu/thread.h"
52 #include "trace.h"
53 #include "exec/target_page.h"
54 #include "io/channel-buffer.h"
55 #include "io/channel-tls.h"
56 #include "migration/colo.h"
57 #include "hw/boards.h"
58 #include "monitor/monitor.h"
59 #include "net/announce.h"
60 #include "qemu/queue.h"
61 #include "multifd.h"
62 #include "threadinfo.h"
63 #include "qemu/yank.h"
64 #include "sysemu/cpus.h"
65 #include "yank_functions.h"
66 #include "sysemu/qtest.h"
67 #include "options.h"
68 #include "sysemu/dirtylimit.h"
69 #include "qemu/sockets.h"
70
71 static NotifierList migration_state_notifiers =
72 NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
73
/*
 * Message types sent on the return path, from destination to source.
 * The values are spelled out because they are numeric identifiers that
 * both ends of the stream must agree on.
 */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID        = 0, /* Must be 0 */
    MIG_RP_MSG_SHUT           = 1, /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG           = 2, /* Response to a PING; data (seq: be32) */

    MIG_RP_MSG_REQ_PAGES_ID   = 3, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES      = 4, /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP    = 5, /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK     = 6, /* tell source that we are ready to resume */
    MIG_RP_MSG_SWITCHOVER_ACK = 7, /* Tell source it's OK to do switchover */

    MIG_RP_MSG_MAX            = 8  /* one past the last valid message type */
};
88
89 /* When we add fault tolerance, we could have several
90 migrations at once. For now we don't need to add
91 dynamic creation of migration */
92
93 static MigrationState *current_migration;
94 static MigrationIncomingState *current_incoming;
95
96 static GSList *migration_blockers[MIG_MODE__MAX];
97
98 static bool migration_object_check(MigrationState *ms, Error **errp);
99 static int migration_maybe_pause(MigrationState *s,
100 int *current_active_state,
101 int new_state);
102 static void migrate_fd_cancel(MigrationState *s);
103 static bool close_return_path_on_source(MigrationState *s);
104
/*
 * Mark the beginning of source-side downtime.
 *
 * Emits the "src-downtime-start" trace checkpoint, then stores the current
 * QEMU_CLOCK_REALTIME value (milliseconds) in @s->downtime_start.
 */
static void migration_downtime_start(MigrationState *s)
{
    int64_t begin_ms;

    trace_vmstate_downtime_checkpoint("src-downtime-start");

    begin_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    s->downtime_start = begin_ms;
}
110
/*
 * Mark the end of source-side downtime.
 *
 * Computes @s->downtime as the time elapsed since @s->downtime_start,
 * unless a non-zero downtime has already been recorded, and then emits the
 * "src-downtime-end" trace checkpoint.
 */
static void migration_downtime_end(MigrationState *s)
{
    int64_t end_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /*
     * A downtime that is already set should mean postcopy recorded it, in
     * which case it is the real downtime and must not be overwritten.
     */
    if (s->downtime == 0) {
        s->downtime = end_ms - s->downtime_start;
    }

    trace_vmstate_downtime_checkpoint("src-downtime-end");
}
125
/*
 * Returns true when the current configuration (multifd or postcopy
 * preempt) needs more than one channel.
 */
static bool migration_needs_multiple_sockets(void)
{
    if (migrate_multifd()) {
        return true;
    }

    return migrate_postcopy_preempt();
}
130
/*
 * Returns true when @addr describes a transport that can carry several
 * channels: a socket transport of type INET, UNIX or VSOCK.  Every other
 * transport, and every other socket type, yields false.
 */
static bool transport_supports_multi_channels(MigrationAddress *addr)
{
    if (addr->transport != MIGRATION_ADDRESS_TYPE_SOCKET) {
        return false;
    }

    switch (addr->u.socket.type) {
    case SOCKET_ADDRESS_TYPE_INET:
    case SOCKET_ADDRESS_TYPE_UNIX:
    case SOCKET_ADDRESS_TYPE_VSOCK:
        return true;
    default:
        return false;
    }
}
143
144 static bool
migration_channels_and_transport_compatible(MigrationAddress * addr,Error ** errp)145 migration_channels_and_transport_compatible(MigrationAddress *addr,
146 Error **errp)
147 {
148 if (migration_needs_multiple_sockets() &&
149 !transport_supports_multi_channels(addr)) {
150 error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)");
151 return false;
152 }
153
154 return true;
155 }
156
page_request_addr_cmp(gconstpointer ap,gconstpointer bp)157 static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
158 {
159 uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
160
161 return (a > b) - (a < b);
162 }
163
/*
 * Stop the VM, forcing it into run state @state, and emit the
 * "src-vm-stopped" trace checkpoint.
 *
 * Returns whatever vm_stop_force_state() returned.
 */
int migration_stop_vm(RunState state)
{
    const int rc = vm_stop_force_state(state);

    trace_vmstate_downtime_checkpoint("src-vm-stopped");
    return rc;
}
172
migration_object_init(void)173 void migration_object_init(void)
174 {
175 /* This can only be called once. */
176 assert(!current_migration);
177 current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
178
179 /*
180 * Init the migrate incoming object as well no matter whether
181 * we'll use it or not.
182 */
183 assert(!current_incoming);
184 current_incoming = g_new0(MigrationIncomingState, 1);
185 current_incoming->state = MIGRATION_STATUS_NONE;
186 current_incoming->postcopy_remote_fds =
187 g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
188 qemu_mutex_init(¤t_incoming->rp_mutex);
189 qemu_mutex_init(¤t_incoming->postcopy_prio_thread_mutex);
190 qemu_event_init(¤t_incoming->main_thread_load_event, false);
191 qemu_sem_init(¤t_incoming->postcopy_pause_sem_dst, 0);
192 qemu_sem_init(¤t_incoming->postcopy_pause_sem_fault, 0);
193 qemu_sem_init(¤t_incoming->postcopy_pause_sem_fast_load, 0);
194 qemu_sem_init(¤t_incoming->postcopy_qemufile_dst_done, 0);
195
196 qemu_mutex_init(¤t_incoming->page_request_mutex);
197 qemu_cond_init(¤t_incoming->page_request_cond);
198 current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
199
200 migration_object_check(current_migration, &error_fatal);
201
202 blk_mig_init();
203 ram_mig_init();
204 dirty_bitmap_mig_init();
205 }
206
migration_cancel(const Error * error)207 void migration_cancel(const Error *error)
208 {
209 if (error) {
210 migrate_set_error(current_migration, error);
211 }
212 if (migrate_dirty_limit()) {
213 qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
214 }
215 migrate_fd_cancel(current_migration);
216 }
217
migration_shutdown(void)218 void migration_shutdown(void)
219 {
220 /*
221 * When the QEMU main thread exit, the COLO thread
222 * may wait a semaphore. So, we should wakeup the
223 * COLO thread before migration shutdown.
224 */
225 colo_shutdown();
226 /*
227 * Cancel the current migration - that will (eventually)
228 * stop the migration using this structure
229 */
230 migration_cancel(NULL);
231 object_unref(OBJECT(current_migration));
232
233 /*
234 * Cancel outgoing migration of dirty bitmaps. It should
235 * at least unref used block nodes.
236 */
237 dirty_bitmap_mig_cancel_outgoing();
238
239 /*
240 * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
241 * are non-critical data, and their loss never considered as
242 * something serious.
243 */
244 dirty_bitmap_mig_cancel_incoming();
245 }
246
247 /* For outgoing */
migrate_get_current(void)248 MigrationState *migrate_get_current(void)
249 {
250 /* This can only be called after the object created. */
251 assert(current_migration);
252 return current_migration;
253 }
254
migration_incoming_get_current(void)255 MigrationIncomingState *migration_incoming_get_current(void)
256 {
257 assert(current_incoming);
258 return current_incoming;
259 }
260
/*
 * Release the transport-level resources attached to @mis: the list of
 * listening socket addresses and, if a cleanup hook is installed, the
 * transport's private data.  Every pointer released is reset to NULL.
 */
void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
{
    if (mis->socket_address_list != NULL) {
        qapi_free_SocketAddressList(mis->socket_address_list);
        mis->socket_address_list = NULL;
    }

    if (mis->transport_cleanup != NULL) {
        mis->transport_cleanup(mis->transport_data);
        mis->transport_cleanup = NULL;
        mis->transport_data = NULL;
    }
}
273
migration_incoming_state_destroy(void)274 void migration_incoming_state_destroy(void)
275 {
276 struct MigrationIncomingState *mis = migration_incoming_get_current();
277
278 multifd_load_cleanup();
279 compress_threads_load_cleanup();
280
281 if (mis->to_src_file) {
282 /* Tell source that we are done */
283 migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
284 qemu_fclose(mis->to_src_file);
285 mis->to_src_file = NULL;
286 }
287
288 if (mis->from_src_file) {
289 migration_ioc_unregister_yank_from_file(mis->from_src_file);
290 qemu_fclose(mis->from_src_file);
291 mis->from_src_file = NULL;
292 }
293 if (mis->postcopy_remote_fds) {
294 g_array_free(mis->postcopy_remote_fds, TRUE);
295 mis->postcopy_remote_fds = NULL;
296 }
297
298 migration_incoming_transport_cleanup(mis);
299 qemu_event_reset(&mis->main_thread_load_event);
300
301 if (mis->page_requested) {
302 g_tree_destroy(mis->page_requested);
303 mis->page_requested = NULL;
304 }
305
306 if (mis->postcopy_qemufile_dst) {
307 migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
308 qemu_fclose(mis->postcopy_qemufile_dst);
309 mis->postcopy_qemufile_dst = NULL;
310 }
311
312 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
313 }
314
/*
 * Emit a QAPI migration event carrying @new_state, but only when the
 * events capability is enabled.
 */
static void migrate_generate_event(int new_state)
{
    if (!migrate_events()) {
        return;
    }

    qapi_event_send_migration(new_state);
}
321
322 /*
323 * Send a message on the return channel back to the source
324 * of the migration.
325 */
migrate_send_rp_message(MigrationIncomingState * mis,enum mig_rp_message_type message_type,uint16_t len,void * data)326 static int migrate_send_rp_message(MigrationIncomingState *mis,
327 enum mig_rp_message_type message_type,
328 uint16_t len, void *data)
329 {
330 int ret = 0;
331
332 trace_migrate_send_rp_message((int)message_type, len);
333 QEMU_LOCK_GUARD(&mis->rp_mutex);
334
335 /*
336 * It's possible that the file handle got lost due to network
337 * failures.
338 */
339 if (!mis->to_src_file) {
340 ret = -EIO;
341 return ret;
342 }
343
344 qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
345 qemu_put_be16(mis->to_src_file, len);
346 qemu_put_buffer(mis->to_src_file, data, len);
347 return qemu_fflush(mis->to_src_file);
348 }
349
350 /* Request one page from the source VM at the given start address.
351 * rb: the RAMBlock to request the page in
352 * Start: Address offset within the RB
353 * Len: Length in bytes required - must be a multiple of pagesize
354 */
migrate_send_rp_message_req_pages(MigrationIncomingState * mis,RAMBlock * rb,ram_addr_t start)355 int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
356 RAMBlock *rb, ram_addr_t start)
357 {
358 uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
359 size_t msglen = 12; /* start + len */
360 size_t len = qemu_ram_pagesize(rb);
361 enum mig_rp_message_type msg_type;
362 const char *rbname;
363 int rbname_len;
364
365 *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
366 *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
367
368 /*
369 * We maintain the last ramblock that we requested for page. Note that we
370 * don't need locking because this function will only be called within the
371 * postcopy ram fault thread.
372 */
373 if (rb != mis->last_rb) {
374 mis->last_rb = rb;
375
376 rbname = qemu_ram_get_idstr(rb);
377 rbname_len = strlen(rbname);
378
379 assert(rbname_len < 256);
380
381 bufc[msglen++] = rbname_len;
382 memcpy(bufc + msglen, rbname, rbname_len);
383 msglen += rbname_len;
384 msg_type = MIG_RP_MSG_REQ_PAGES_ID;
385 } else {
386 msg_type = MIG_RP_MSG_REQ_PAGES;
387 }
388
389 return migrate_send_rp_message(mis, msg_type, msglen, bufc);
390 }
391
/*
 * Ask the source for the page at offset @start of @rb unless it has
 * already been received.
 *
 * @haddr is the host address involved; rounded down to the page size of
 * @rb it is the key under which the request is tracked in
 * mis->page_requested.
 *
 * Returns 0 when the page is already present, otherwise the result of
 * sending the request message.
 */
int migrate_send_rp_req_pages(MigrationIncomingState *mis,
                              RAMBlock *rb, ram_addr_t start, uint64_t haddr)
{
    void *page_key = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
    bool already_here = false;

    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
        already_here = ramblock_recv_bitmap_test_byte_offset(rb, start);
        if (!already_here) {
            if (g_tree_lookup(mis->page_requested, page_key) == NULL) {
                /*
                 * Not received and not queued yet: queue it.  The value
                 * is 1 so that g_tree_lookup() returns a non-NULL (true)
                 * result for keys that are present.
                 */
                g_tree_insert(mis->page_requested, page_key, (gpointer)1);
                qatomic_inc(&mis->page_requested_count);
                trace_postcopy_page_req_add(page_key,
                                            mis->page_requested_count);
            }
        }
    }

    /*
     * A page that has arrived stays there forever, so the result can be
     * used outside the lock and no message needs to be sent.
     */
    return already_here ? 0
                        : migrate_send_rp_message_req_pages(mis, rb, start);
}
422
/* Whether COLO has been enabled for the incoming migration. */
static bool migration_colo_enabled;

/* Returns the current value of the incoming-COLO flag. */
bool migration_incoming_colo_enabled(void)
{
    const bool enabled = migration_colo_enabled;

    return enabled;
}
428
migration_incoming_disable_colo(void)429 void migration_incoming_disable_colo(void)
430 {
431 ram_block_discard_disable(false);
432 migration_colo_enabled = false;
433 }
434
/*
 * Turn incoming COLO on in response to an ENABLE_COLO command in the
 * migration stream.
 *
 * Returns 0 on success, or a negative errno value after reporting the
 * error:
 *   -ENOTSUP  built without CONFIG_REPLICATION
 *   -EINVAL   the COLO capability is not set
 *   -EBUSY    RAM discard could not be disabled
 */
int migration_incoming_enable_colo(void)
{
#ifndef CONFIG_REPLICATION
    error_report("ENABLE_COLO command come in migration stream, but COLO "
                 "module is not built in");
    return -ENOTSUP;
#else
    if (!migrate_colo()) {
        error_report("ENABLE_COLO command come in migration stream, but c-colo "
                     "capability is not set");
        return -EINVAL;
    }

    if (ram_block_discard_disable(true)) {
        error_report("COLO: cannot disable RAM discard");
        return -EBUSY;
    }

    migration_colo_enabled = true;
    return 0;
#endif
}
456
/*
 * Record @address in the incoming state's socket address list.
 *
 * A clone of @address is prepended to the list; the caller keeps
 * ownership of the original.
 */
void migrate_add_address(SocketAddress *address)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    SocketAddress *copy = QAPI_CLONE(SocketAddress, address);

    QAPI_LIST_PREPEND(mis->socket_address_list, copy);
}
464
migrate_uri_parse(const char * uri,MigrationChannel ** channel,Error ** errp)465 bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
466 Error **errp)
467 {
468 g_autoptr(MigrationChannel) val = g_new0(MigrationChannel, 1);
469 g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1);
470 InetSocketAddress *isock = &addr->u.rdma;
471 strList **tail = &addr->u.exec.args;
472
473 if (strstart(uri, "exec:", NULL)) {
474 addr->transport = MIGRATION_ADDRESS_TYPE_EXEC;
475 #ifdef WIN32
476 QAPI_LIST_APPEND(tail, g_strdup(exec_get_cmd_path()));
477 QAPI_LIST_APPEND(tail, g_strdup("/c"));
478 #else
479 QAPI_LIST_APPEND(tail, g_strdup("/bin/sh"));
480 QAPI_LIST_APPEND(tail, g_strdup("-c"));
481 #endif
482 QAPI_LIST_APPEND(tail, g_strdup(uri + strlen("exec:")));
483 } else if (strstart(uri, "rdma:", NULL)) {
484 if (inet_parse(isock, uri + strlen("rdma:"), errp)) {
485 qapi_free_InetSocketAddress(isock);
486 return false;
487 }
488 addr->transport = MIGRATION_ADDRESS_TYPE_RDMA;
489 } else if (strstart(uri, "tcp:", NULL) ||
490 strstart(uri, "unix:", NULL) ||
491 strstart(uri, "vsock:", NULL) ||
492 strstart(uri, "fd:", NULL)) {
493 addr->transport = MIGRATION_ADDRESS_TYPE_SOCKET;
494 SocketAddress *saddr = socket_parse(uri, errp);
495 if (!saddr) {
496 return false;
497 }
498 addr->u.socket.type = saddr->type;
499 addr->u.socket.u = saddr->u;
500 /* Don't free the objects inside; their ownership moved to "addr" */
501 g_free(saddr);
502 } else if (strstart(uri, "file:", NULL)) {
503 addr->transport = MIGRATION_ADDRESS_TYPE_FILE;
504 addr->u.file.filename = g_strdup(uri + strlen("file:"));
505 if (file_parse_offset(addr->u.file.filename, &addr->u.file.offset,
506 errp)) {
507 return false;
508 }
509 } else {
510 error_setg(errp, "unknown migration protocol: %s", uri);
511 return false;
512 }
513
514 val->channel_type = MIGRATION_CHANNEL_TYPE_MAIN;
515 val->addr = g_steal_pointer(&addr);
516 *channel = g_steal_pointer(&val);
517 return true;
518 }
519
qemu_start_incoming_migration(const char * uri,bool has_channels,MigrationChannelList * channels,Error ** errp)520 static void qemu_start_incoming_migration(const char *uri, bool has_channels,
521 MigrationChannelList *channels,
522 Error **errp)
523 {
524 g_autoptr(MigrationChannel) channel = NULL;
525 MigrationAddress *addr = NULL;
526 MigrationIncomingState *mis = migration_incoming_get_current();
527
528 /*
529 * Having preliminary checks for uri and channel
530 */
531 if (uri && has_channels) {
532 error_setg(errp, "'uri' and 'channels' arguments are mutually "
533 "exclusive; exactly one of the two should be present in "
534 "'migrate-incoming' qmp command ");
535 return;
536 } else if (channels) {
537 /* To verify that Migrate channel list has only item */
538 if (channels->next) {
539 error_setg(errp, "Channel list has more than one entries");
540 return;
541 }
542 addr = channels->value->addr;
543 } else if (uri) {
544 /* caller uses the old URI syntax */
545 if (!migrate_uri_parse(uri, &channel, errp)) {
546 return;
547 }
548 addr = channel->addr;
549 } else {
550 error_setg(errp, "neither 'uri' or 'channels' argument are "
551 "specified in 'migrate-incoming' qmp command ");
552 return;
553 }
554
555 /* transport mechanism not suitable for migration? */
556 if (!migration_channels_and_transport_compatible(addr, errp)) {
557 return;
558 }
559
560 migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
561 MIGRATION_STATUS_SETUP);
562
563 if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
564 SocketAddress *saddr = &addr->u.socket;
565 if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
566 saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
567 saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
568 socket_start_incoming_migration(saddr, errp);
569 } else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
570 fd_start_incoming_migration(saddr->u.fd.str, errp);
571 }
572 #ifdef CONFIG_RDMA
573 } else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
574 if (migrate_compress()) {
575 error_setg(errp, "RDMA and compression can't be used together");
576 return;
577 }
578 if (migrate_xbzrle()) {
579 error_setg(errp, "RDMA and XBZRLE can't be used together");
580 return;
581 }
582 if (migrate_multifd()) {
583 error_setg(errp, "RDMA and multifd can't be used together");
584 return;
585 }
586 rdma_start_incoming_migration(&addr->u.rdma, errp);
587 #endif
588 } else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
589 exec_start_incoming_migration(addr->u.exec.args, errp);
590 } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
591 file_start_incoming_migration(&addr->u.file, errp);
592 } else {
593 error_setg(errp, "unknown migration protocol: %s", uri);
594 }
595 }
596
process_incoming_migration_bh(void * opaque)597 static void process_incoming_migration_bh(void *opaque)
598 {
599 Error *local_err = NULL;
600 MigrationIncomingState *mis = opaque;
601
602 trace_vmstate_downtime_checkpoint("dst-precopy-bh-enter");
603
604 /* If capability late_block_activate is set:
605 * Only fire up the block code now if we're going to restart the
606 * VM, else 'cont' will do it.
607 * This causes file locking to happen; so we don't want it to happen
608 * unless we really are starting the VM.
609 */
610 if (!migrate_late_block_activate() ||
611 (autostart && (!global_state_received() ||
612 global_state_get_runstate() == RUN_STATE_RUNNING))) {
613 /* Make sure all file formats throw away their mutable metadata.
614 * If we get an error here, just don't restart the VM yet. */
615 bdrv_activate_all(&local_err);
616 if (local_err) {
617 error_report_err(local_err);
618 local_err = NULL;
619 autostart = false;
620 }
621 }
622
623 /*
624 * This must happen after all error conditions are dealt with and
625 * we're sure the VM is going to be running on this host.
626 */
627 qemu_announce_self(&mis->announce_timer, migrate_announce_params());
628
629 trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced");
630
631 multifd_load_shutdown();
632
633 dirty_bitmap_mig_before_vm_start();
634
635 if (!global_state_received() ||
636 global_state_get_runstate() == RUN_STATE_RUNNING) {
637 if (autostart) {
638 vm_start();
639 } else {
640 runstate_set(RUN_STATE_PAUSED);
641 }
642 } else if (migration_incoming_colo_enabled()) {
643 migration_incoming_disable_colo();
644 vm_start();
645 } else {
646 runstate_set(global_state_get_runstate());
647 }
648 trace_vmstate_downtime_checkpoint("dst-precopy-bh-vm-started");
649 /*
650 * This must happen after any state changes since as soon as an external
651 * observer sees this event they might start to prod at the VM assuming
652 * it's ready to use.
653 */
654 migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
655 MIGRATION_STATUS_COMPLETED);
656 qemu_bh_delete(mis->bh);
657 migration_incoming_state_destroy();
658 object_unref(OBJECT(migrate_get_current()));
659 }
660
661 static void coroutine_fn
process_incoming_migration_co(void * opaque)662 process_incoming_migration_co(void *opaque)
663 {
664 MigrationIncomingState *mis = migration_incoming_get_current();
665 PostcopyState ps;
666 int ret;
667
668 assert(mis->from_src_file);
669
670 if (compress_threads_load_setup(mis->from_src_file)) {
671 error_report("Failed to setup decompress threads");
672 goto fail;
673 }
674
675 mis->largest_page_size = qemu_ram_pagesize_largest();
676 postcopy_state_set(POSTCOPY_INCOMING_NONE);
677 migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
678 MIGRATION_STATUS_ACTIVE);
679
680 mis->loadvm_co = qemu_coroutine_self();
681 ret = qemu_loadvm_state(mis->from_src_file);
682 mis->loadvm_co = NULL;
683
684 trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
685
686 ps = postcopy_state_get();
687 trace_process_incoming_migration_co_end(ret, ps);
688 if (ps != POSTCOPY_INCOMING_NONE) {
689 if (ps == POSTCOPY_INCOMING_ADVISE) {
690 /*
691 * Where a migration had postcopy enabled (and thus went to advise)
692 * but managed to complete within the precopy period, we can use
693 * the normal exit.
694 */
695 postcopy_ram_incoming_cleanup(mis);
696 } else if (ret >= 0) {
697 /*
698 * Postcopy was started, cleanup should happen at the end of the
699 * postcopy thread.
700 */
701 trace_process_incoming_migration_co_postcopy_end_main();
702 return;
703 }
704 /* Else if something went wrong then just fall out of the normal exit */
705 }
706
707 if (ret < 0) {
708 error_report("load of migration failed: %s", strerror(-ret));
709 goto fail;
710 }
711
712 if (colo_incoming_co() < 0) {
713 goto fail;
714 }
715
716 mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
717 object_ref(OBJECT(migrate_get_current()));
718 qemu_bh_schedule(mis->bh);
719 return;
720 fail:
721 migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
722 MIGRATION_STATUS_FAILED);
723 qemu_fclose(mis->from_src_file);
724
725 multifd_load_cleanup();
726 compress_threads_load_cleanup();
727
728 exit(EXIT_FAILURE);
729 }
730
731 /**
732 * migration_incoming_setup: Setup incoming migration
733 * @f: file for main migration channel
734 * @errp: where to put errors
735 *
736 * Returns: %true on success, %false on error.
737 */
migration_incoming_setup(QEMUFile * f,Error ** errp)738 static bool migration_incoming_setup(QEMUFile *f, Error **errp)
739 {
740 MigrationIncomingState *mis = migration_incoming_get_current();
741
742 if (!mis->from_src_file) {
743 mis->from_src_file = f;
744 }
745 qemu_file_set_blocking(f, false);
746 return true;
747 }
748
migration_incoming_process(void)749 void migration_incoming_process(void)
750 {
751 Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
752 qemu_coroutine_enter(co);
753 }
754
755 /* Returns true if recovered from a paused migration, otherwise false */
postcopy_try_recover(void)756 static bool postcopy_try_recover(void)
757 {
758 MigrationIncomingState *mis = migration_incoming_get_current();
759
760 if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
761 /* Resumed from a paused postcopy migration */
762
763 /* This should be set already in migration_incoming_setup() */
764 assert(mis->from_src_file);
765 /* Postcopy has standalone thread to do vm load */
766 qemu_file_set_blocking(mis->from_src_file, true);
767
768 /* Re-configure the return path */
769 mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
770
771 migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
772 MIGRATION_STATUS_POSTCOPY_RECOVER);
773
774 /*
775 * Here, we only wake up the main loading thread (while the
776 * rest threads will still be waiting), so that we can receive
777 * commands from source now, and answer it if needed. The
778 * rest threads will be woken up afterwards until we are sure
779 * that source is ready to reply to page requests.
780 */
781 qemu_sem_post(&mis->postcopy_pause_sem_dst);
782 return true;
783 }
784
785 return false;
786 }
787
migration_fd_process_incoming(QEMUFile * f,Error ** errp)788 void migration_fd_process_incoming(QEMUFile *f, Error **errp)
789 {
790 if (!migration_incoming_setup(f, errp)) {
791 return;
792 }
793 if (postcopy_try_recover()) {
794 return;
795 }
796 migration_incoming_process();
797 }
798
/*
 * Decide whether a new incoming migration process should be started now.
 *
 * @main_channel: whether the channel just created is the main channel.
 *
 * Returns true when we want to start, false otherwise.
 */
static bool migration_should_start_incoming(bool main_channel)
{
    if (migrate_multifd()) {
        /* Multifd doesn't start unless all channels are established */
        return migration_has_all_channels();
    }

    if (migrate_postcopy_preempt()) {
        /* Preempt channel only starts when the main channel is created */
        return main_channel;
    }

    /*
     * Every other kind of migration has a single channel, so only the
     * main channel can get here, and it always proceeds.
     */
    assert(main_channel);
    return true;
}
823
migration_ioc_process_incoming(QIOChannel * ioc,Error ** errp)824 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
825 {
826 MigrationIncomingState *mis = migration_incoming_get_current();
827 Error *local_err = NULL;
828 QEMUFile *f;
829 bool default_channel = true;
830 uint32_t channel_magic = 0;
831 int ret = 0;
832
833 if (migrate_multifd() && !migrate_postcopy_ram() &&
834 qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
835 /*
836 * With multiple channels, it is possible that we receive channels
837 * out of order on destination side, causing incorrect mapping of
838 * source channels on destination side. Check channel MAGIC to
839 * decide type of channel. Please note this is best effort, postcopy
840 * preempt channel does not send any magic number so avoid it for
841 * postcopy live migration. Also tls live migration already does
842 * tls handshake while initializing main channel so with tls this
843 * issue is not possible.
844 */
845 ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
846 sizeof(channel_magic), &local_err);
847
848 if (ret != 0) {
849 error_propagate(errp, local_err);
850 return;
851 }
852
853 default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
854 } else {
855 default_channel = !mis->from_src_file;
856 }
857
858 if (multifd_load_setup(errp) != 0) {
859 error_setg(errp, "Failed to setup multifd channels");
860 return;
861 }
862
863 if (default_channel) {
864 f = qemu_file_new_input(ioc);
865
866 if (!migration_incoming_setup(f, errp)) {
867 return;
868 }
869 } else {
870 /* Multiple connections */
871 assert(migration_needs_multiple_sockets());
872 if (migrate_multifd()) {
873 multifd_recv_new_channel(ioc, &local_err);
874 } else {
875 assert(migrate_postcopy_preempt());
876 f = qemu_file_new_input(ioc);
877 postcopy_preempt_new_channel(mis, f);
878 }
879 if (local_err) {
880 error_propagate(errp, local_err);
881 return;
882 }
883 }
884
885 if (migration_should_start_incoming(default_channel)) {
886 /* If it's a recovery, we're done */
887 if (postcopy_try_recover()) {
888 return;
889 }
890 migration_incoming_process();
891 }
892 }
893
894 /**
895 * @migration_has_all_channels: We have received all channels that we need
896 *
897 * Returns true when we have got connections to all the channels that
898 * we need for migration.
899 */
migration_has_all_channels(void)900 bool migration_has_all_channels(void)
901 {
902 MigrationIncomingState *mis = migration_incoming_get_current();
903
904 if (!mis->from_src_file) {
905 return false;
906 }
907
908 if (migrate_multifd()) {
909 return multifd_recv_all_channels_created();
910 }
911
912 if (migrate_postcopy_preempt()) {
913 return mis->postcopy_qemufile_dst != NULL;
914 }
915
916 return true;
917 }
918
/*
 * Send a payload-less 'SWITCHOVER_ACK' message on the return channel.
 *
 * Returns the result of migrate_send_rp_message().
 */
int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
{
    const int rc = migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK,
                                           0, NULL);

    return rc;
}
923
924 /*
925 * Send a 'SHUT' message on the return channel with the given value
926 * to indicate that we've finished with the RP. Non-0 value indicates
927 * error.
928 */
migrate_send_rp_shut(MigrationIncomingState * mis,uint32_t value)929 void migrate_send_rp_shut(MigrationIncomingState *mis,
930 uint32_t value)
931 {
932 uint32_t buf;
933
934 buf = cpu_to_be32(value);
935 migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
936 }
937
938 /*
939 * Send a 'PONG' message on the return channel with the given value
940 * (normally in response to a 'PING')
941 */
migrate_send_rp_pong(MigrationIncomingState * mis,uint32_t value)942 void migrate_send_rp_pong(MigrationIncomingState *mis,
943 uint32_t value)
944 {
945 uint32_t buf;
946
947 buf = cpu_to_be32(value);
948 migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
949 }
950
/*
 * Send the received-page bitmap of RAM block @block_name to the source.
 *
 * Only valid while the incoming side is in POSTCOPY_RECOVER; in any other
 * state an error is reported and nothing is sent.  A RECV_BITMAP message
 * holding the one-byte name length and the name goes first, followed by
 * the bitmap itself.
 */
void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name)
{
    char buf[512];
    size_t len;
    int64_t res;

    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
                     __func__);
        return;
    }

    /*
     * First, we send the header part. It contains only the len of
     * idstr, and the idstr itself.  The length travels in one byte, so
     * refuse names that would neither fit that byte nor the buffer.
     */
    len = strlen(block_name);
    if (len >= 256) {
        error_report("%s: block name too long (%zu bytes)", __func__, len);
        return;
    }
    buf[0] = (char)len;
    memcpy(buf + 1, block_name, len);

    migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);

    /*
     * Next, we dump the received bitmap to the stream.
     *
     * TODO: currently we are safe since we are the only one that is
     * using the to_src_file handle (fault thread is still paused),
     * and it's ok even not taking the mutex. However the best way is
     * to take the lock before sending the message header, and release
     * the lock after sending the bitmap.
     */
    qemu_mutex_lock(&mis->rp_mutex);
    res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
    qemu_mutex_unlock(&mis->rp_mutex);

    trace_migrate_send_rp_recv_bitmap(block_name, res);
}
989
/*
 * Send a MIG_RP_MSG_RESUME_ACK message on the return path; the payload is
 * @value converted with cpu_to_be32().
 */
void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
{
    uint32_t payload = cpu_to_be32(value);

    migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(payload),
                            &payload);
}
997
998 /*
999 * Return true if we're already in the middle of a migration
1000 * (i.e. any of the active or setup states)
1001 */
migration_is_setup_or_active(int state)1002 bool migration_is_setup_or_active(int state)
1003 {
1004 switch (state) {
1005 case MIGRATION_STATUS_ACTIVE:
1006 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1007 case MIGRATION_STATUS_POSTCOPY_PAUSED:
1008 case MIGRATION_STATUS_POSTCOPY_RECOVER:
1009 case MIGRATION_STATUS_SETUP:
1010 case MIGRATION_STATUS_PRE_SWITCHOVER:
1011 case MIGRATION_STATUS_DEVICE:
1012 case MIGRATION_STATUS_WAIT_UNPLUG:
1013 case MIGRATION_STATUS_COLO:
1014 return true;
1015
1016 default:
1017 return false;
1018
1019 }
1020 }
1021
migration_is_running(int state)1022 bool migration_is_running(int state)
1023 {
1024 switch (state) {
1025 case MIGRATION_STATUS_ACTIVE:
1026 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1027 case MIGRATION_STATUS_POSTCOPY_PAUSED:
1028 case MIGRATION_STATUS_POSTCOPY_RECOVER:
1029 case MIGRATION_STATUS_SETUP:
1030 case MIGRATION_STATUS_PRE_SWITCHOVER:
1031 case MIGRATION_STATUS_DEVICE:
1032 case MIGRATION_STATUS_WAIT_UNPLUG:
1033 case MIGRATION_STATUS_CANCELLING:
1034 return true;
1035
1036 default:
1037 return false;
1038
1039 }
1040 }
1041
/*
 * Decide whether the real downtime (rather than the expected one) should
 * be reported: true once the migration has completed or while it is in
 * postcopy.
 */
static bool migrate_show_downtime(MigrationState *s)
{
    if (s->state == MIGRATION_STATUS_COMPLETED) {
        return true;
    }
    return migration_in_postcopy();
}
1046
/*
 * Fill the status flag and the timing fields of @info from @s: setup
 * time, total time, and either the downtime or the expected downtime.
 */
static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
    info->has_status = true;
    info->has_setup_time = true;
    info->setup_time = s->setup_time;

    /*
     * Total time is always reported: the recorded value once completed,
     * otherwise the time elapsed since the migration started.
     */
    info->has_total_time = true;
    if (s->state == MIGRATION_STATUS_COMPLETED) {
        info->total_time = s->total_time;
    } else {
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                           s->start_time;
    }

    if (!migrate_show_downtime(s)) {
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
    } else {
        info->has_downtime = true;
        info->downtime = s->downtime;
    }
}
1070
/*
 * Allocate info->ram and fill it from the global migration counters and
 * from @s.  Also fills the XBZRLE cache statistics, compression
 * statistics, CPU throttle percentage and dirty-limit figures when the
 * corresponding feature is enabled/active.
 */
static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
    size_t target_page_size = qemu_target_page_size();

    info->ram = g_malloc0(sizeof(*info->ram));

    /* Byte and page counters */
    info->ram->transferred = migration_transferred_bytes();
    info->ram->total = ram_bytes_total();
    info->ram->duplicate = stat64_get(&mig_stats.zero_pages);
    /* legacy value.  It is not used anymore */
    info->ram->skipped = 0;
    info->ram->normal = stat64_get(&mig_stats.normal_pages);
    info->ram->normal_bytes = info->ram->normal * target_page_size;
    info->ram->mbps = s->mbps;
    info->ram->dirty_sync_count = stat64_get(&mig_stats.dirty_sync_count);
    info->ram->dirty_sync_missed_zero_copy =
        stat64_get(&mig_stats.dirty_sync_missed_zero_copy);
    info->ram->postcopy_requests = stat64_get(&mig_stats.postcopy_requests);
    info->ram->page_size = target_page_size;
    info->ram->multifd_bytes = stat64_get(&mig_stats.multifd_bytes);
    info->ram->pages_per_second = s->pages_per_second;
    info->ram->precopy_bytes = stat64_get(&mig_stats.precopy_bytes);
    info->ram->downtime_bytes = stat64_get(&mig_stats.downtime_bytes);
    info->ram->postcopy_bytes = stat64_get(&mig_stats.postcopy_bytes);

    /* XBZRLE cache statistics, only when XBZRLE is enabled */
    if (migrate_xbzrle()) {
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_counters.bytes;
        info->xbzrle_cache->pages = xbzrle_counters.pages;
        info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
        info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
        info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
        info->xbzrle_cache->overflow = xbzrle_counters.overflow;
    }

    populate_compress(info);

    if (cpu_throttle_active()) {
        info->has_cpu_throttle_percentage = true;
        info->cpu_throttle_percentage = cpu_throttle_get_percentage();
    }

    /* Remaining bytes and dirty rate are only filled before completion */
    if (s->state != MIGRATION_STATUS_COMPLETED) {
        info->ram->remaining = ram_bytes_remaining();
        info->ram->dirty_pages_rate = stat64_get(&mig_stats.dirty_pages_rate);
    }

    if (migrate_dirty_limit() && dirtylimit_in_service()) {
        info->has_dirty_limit_throttle_time_per_round = true;
        info->dirty_limit_throttle_time_per_round =
            dirtylimit_throttle_time_per_round();

        info->has_dirty_limit_ring_full_time = true;
        info->dirty_limit_ring_full_time = dirtylimit_ring_full_time();
    }
}
1130
/*
 * Allocate and fill info->disk with the block migration byte counters.
 * Nothing is done unless block migration is active.
 */
static void populate_disk_info(MigrationInfo *info)
{
    if (!blk_mig_active()) {
        return;
    }

    info->disk = g_malloc0(sizeof(*info->disk));
    info->disk->transferred = blk_mig_bytes_transferred();
    info->disk->remaining = blk_mig_bytes_remaining();
    info->disk->total = blk_mig_bytes_total();
}
1140
/*
 * Fill the outgoing-side part of @info: the list of reasons migration is
 * blocked, then, depending on the current state, the status and the
 * time/RAM/disk/VFIO statistics, and finally the error description if an
 * error is recorded.
 *
 * When the state is MIGRATION_STATUS_NONE the function returns right
 * after filling the blocked reasons, leaving info->status untouched.
 */
static void fill_source_migration_info(MigrationInfo *info)
{
    MigrationState *s = migrate_get_current();
    int state = qatomic_read(&s->state);
    GSList *blocker = migration_blockers[migrate_mode()];

    info->blocked_reasons = NULL;

    /*
     * A migration can be blocked for two kinds of reasons:
     *   a) devices marked in VMState as non-migratable, and
     *   b) explicit migration blockers.
     * Both are reported here.
     */
    qemu_savevm_non_migratable_list(&info->blocked_reasons);

    for (; blocker; blocker = g_slist_next(blocker)) {
        QAPI_LIST_PREPEND(info->blocked_reasons,
                          g_strdup(error_get_pretty(blocker->data)));
    }
    info->has_blocked_reasons = info->blocked_reasons != NULL;

    switch (state) {
    case MIGRATION_STATUS_NONE:
        /*
         * No migration has ever happened; do not overwrite the
         * destination migration status.
         */
        return;
    case MIGRATION_STATUS_SETUP:
        info->has_status = true;
        info->has_total_time = false;
        break;
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
        /* TODO add some postcopy stats */
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_disk_info(info);
        migration_populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_COMPLETED:
        populate_time_info(info, s);
        populate_ram_info(info, s);
        migration_populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_COLO:
        /* TODO: display COLO specific information (checkpoint info etc.) */
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_WAIT_UNPLUG:
        /* Only the status itself is reported for these states */
        info->has_status = true;
        break;
    }
    info->status = state;

    /* s->error is read under error_mutex */
    QEMU_LOCK_GUARD(&s->error_mutex);
    if (s->error) {
        info->error_desc = g_strdup(error_get_pretty(s->error));
    }
}
1212
/*
 * Fill the incoming-side part of @info: a clone of the listening socket
 * address list (when there is one) and the incoming migration status.
 *
 * When the incoming state is MIGRATION_STATUS_NONE, info->status is left
 * untouched.
 */
static void fill_destination_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->socket_address_list) {
        info->has_socket_address = true;
        info->socket_address =
            QAPI_CLONE(SocketAddressList, mis->socket_address_list);
    }

    switch (mis->state) {
    case MIGRATION_STATUS_NONE:
        return;
    case MIGRATION_STATUS_COMPLETED:
        info->has_status = true;
        fill_destination_postcopy_migration_info(info);
        break;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        break;
    }
    info->status = mis->state;
}
1244
qmp_query_migrate(Error ** errp)1245 MigrationInfo *qmp_query_migrate(Error **errp)
1246 {
1247 MigrationInfo *info = g_malloc0(sizeof(*info));
1248
1249 fill_destination_migration_info(info);
1250 fill_source_migration_info(info);
1251
1252 return info;
1253 }
1254
qmp_migrate_start_postcopy(Error ** errp)1255 void qmp_migrate_start_postcopy(Error **errp)
1256 {
1257 MigrationState *s = migrate_get_current();
1258
1259 if (!migrate_postcopy()) {
1260 error_setg(errp, "Enable postcopy with migrate_set_capability before"
1261 " the start of migration");
1262 return;
1263 }
1264
1265 if (s->state == MIGRATION_STATUS_NONE) {
1266 error_setg(errp, "Postcopy must be started after migration has been"
1267 " started");
1268 return;
1269 }
1270 /*
1271 * we don't error if migration has finished since that would be racy
1272 * with issuing this command.
1273 */
1274 qatomic_set(&s->start_postcopy, true);
1275 }
1276
1277 /* shared migration helpers */
1278
migrate_set_state(int * state,int old_state,int new_state)1279 void migrate_set_state(int *state, int old_state, int new_state)
1280 {
1281 assert(new_state < MIGRATION_STATUS__MAX);
1282 if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
1283 trace_migrate_set_state(MigrationStatus_str(new_state));
1284 migrate_generate_event(new_state);
1285 }
1286 }
1287
/*
 * Tear down the outgoing migration: delete the cleanup bottom half, free
 * the hostname and vmdesc, run the savevm cleanup, join the migration
 * thread and close the outgoing file if there is one, close the return
 * path, finish a pending cancel, report any recorded error, call the
 * migration notifiers and unregister the yank instance.
 */
static void migrate_fd_cleanup(MigrationState *s)
{
    qemu_bh_delete(s->cleanup_bh);
    s->cleanup_bh = NULL;

    g_free(s->hostname);
    s->hostname = NULL;
    json_writer_free(s->vmdesc);
    s->vmdesc = NULL;

    qemu_savevm_state_cleanup();

    if (s->to_dst_file) {
        QEMUFile *closing;

        trace_migrate_fd_cleanup();

        /* The iothread lock is dropped while joining the migration thread */
        qemu_mutex_unlock_iothread();
        if (s->migration_thread_running) {
            qemu_thread_join(&s->thread);
            s->migration_thread_running = false;
        }
        qemu_mutex_lock_iothread();

        multifd_save_cleanup();

        /* Detach the file from the state under qemu_file_lock... */
        qemu_mutex_lock(&s->qemu_file_lock);
        closing = s->to_dst_file;
        s->to_dst_file = NULL;
        qemu_mutex_unlock(&s->qemu_file_lock);

        /*
         * ...and close it without the lock, so that the critical section
         * cannot block for long.
         */
        migration_ioc_unregister_yank_from_file(closing);
        qemu_fclose(closing);
    }

    /*
     * to_dst_file has already been cleaned up, so errors from the return
     * path might be due to that; they are ignored.
     */
    close_return_path_on_source(s);

    assert(!migration_is_active(s));

    if (s->state == MIGRATION_STATUS_CANCELLING) {
        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
                          MIGRATION_STATUS_CANCELLED);
    }

    if (s->error) {
        /*
         * The error is still used by 'info migrate', so it must not be
         * freed: report a copy instead.
         */
        error_report_err(error_copy(s->error));
    }

    migration_call_notifiers(s);
    block_cleanup_parameters();
    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
1345
/*
 * Schedule the cleanup bottom half of @s.
 */
static void migrate_fd_cleanup_schedule(MigrationState *s)
{
    /*
     * Take a reference on the state on behalf of the bottom half: it may
     * run at a point where no other reference is left.
     */
    object_ref(OBJECT(s));

    qemu_bh_schedule(s->cleanup_bh);
}
1355
migrate_fd_cleanup_bh(void * opaque)1356 static void migrate_fd_cleanup_bh(void *opaque)
1357 {
1358 MigrationState *s = opaque;
1359 migrate_fd_cleanup(s);
1360 object_unref(OBJECT(s));
1361 }
1362
migrate_set_error(MigrationState * s,const Error * error)1363 void migrate_set_error(MigrationState *s, const Error *error)
1364 {
1365 QEMU_LOCK_GUARD(&s->error_mutex);
1366 if (!s->error) {
1367 s->error = error_copy(error);
1368 }
1369 }
1370
/*
 * Return true if an error is currently recorded in @s.
 */
bool migrate_has_error(MigrationState *s)
{
    /* The lock is not helpful here, but still follow the rule */
    QEMU_LOCK_GUARD(&s->error_mutex);

    return qatomic_read(&s->error) != NULL;
}
1377
/*
 * Free the error recorded in @s, if any, and reset the field to NULL.
 * Done under error_mutex.
 */
static void migrate_error_free(MigrationState *s)
{
    QEMU_LOCK_GUARD(&s->error_mutex);

    if (!s->error) {
        return;
    }
    error_free(s->error);
    s->error = NULL;
}
1386
migrate_fd_error(MigrationState * s,const Error * error)1387 static void migrate_fd_error(MigrationState *s, const Error *error)
1388 {
1389 trace_migrate_fd_error(error_get_pretty(error));
1390 assert(s->to_dst_file == NULL);
1391 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1392 MIGRATION_STATUS_FAILED);
1393 migrate_set_error(s, error);
1394 }
1395
/*
 * Cancel the outgoing migration: shut down the return path file, move a
 * running migration into CANCELLING, shut down the outgoing file and, if
 * block devices had been inactivated, try to re-activate them.
 */
static void migrate_fd_cancel(MigrationState *s)
{
    int cur_state;

    trace_migrate_fd_cancel();

    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
        if (s->rp_state.from_dst_file) {
            /* shutdown the rp socket, so causing the rp thread to shutdown */
            qemu_file_shutdown(s->rp_state.from_dst_file);
        }
    }

    /*
     * Retry the transition to CANCELLING until it sticks, or until the
     * migration is found not to be running any more.
     */
    for (;;) {
        cur_state = s->state;
        if (!migration_is_running(cur_state)) {
            break;
        }
        /* If the migration is paused, kick it out of the pause */
        if (cur_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
            qemu_sem_post(&s->pause_sem);
        }
        migrate_set_state(&s->state, cur_state, MIGRATION_STATUS_CANCELLING);
        if (s->state == MIGRATION_STATUS_CANCELLING) {
            break;
        }
    }

    /*
     * If we're unlucky the migration code might be stuck somewhere in a
     * send/write while the network has failed and is waiting to timeout;
     * if we've got shutdown(2) available then we can force it to quit.
     */
    if (s->state == MIGRATION_STATUS_CANCELLING) {
        WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
            if (s->to_dst_file) {
                qemu_file_shutdown(s->to_dst_file);
            }
        }
    }

    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
        Error *activate_err = NULL;

        bdrv_activate_all(&activate_err);
        if (activate_err) {
            error_report_err(activate_err);
        } else {
            s->block_inactive = false;
        }
    }
}
1444
/*
 * Set @func as the callback of @notify and add @notify to the list of
 * migration state notifiers.
 */
void migration_add_notifier(Notifier *notify,
                            void (*func)(Notifier *notifier, void *data))
{
    notify->notify = func;

    notifier_list_add(&migration_state_notifiers, notify);
}
1451
/*
 * Remove @notify and clear its callback.  A notifier whose callback is
 * already NULL is left alone.
 */
void migration_remove_notifier(Notifier *notify)
{
    if (!notify->notify) {
        return;
    }

    notifier_remove(notify);
    notify->notify = NULL;
}
1459
/*
 * Notify every registered migration state notifier, passing @s as the
 * notification data.
 */
void migration_call_notifiers(MigrationState *s)
{
    void *data = s;

    notifier_list_notify(&migration_state_notifiers, data);
}
1464
/* Return true if the state of @s is MIGRATION_STATUS_SETUP. */
bool migration_in_setup(MigrationState *s)
{
    bool in_setup = (s->state == MIGRATION_STATUS_SETUP);

    return in_setup;
}
1469
/* Return true if the state of @s is MIGRATION_STATUS_COMPLETED. */
bool migration_has_finished(MigrationState *s)
{
    bool completed = (s->state == MIGRATION_STATUS_COMPLETED);

    return completed;
}
1474
/*
 * Return true if the state of @s is either MIGRATION_STATUS_CANCELLED or
 * MIGRATION_STATUS_FAILED.
 */
bool migration_has_failed(MigrationState *s)
{
    switch (s->state) {
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_FAILED:
        return true;
    default:
        return false;
    }
}
1480
migration_in_postcopy(void)1481 bool migration_in_postcopy(void)
1482 {
1483 MigrationState *s = migrate_get_current();
1484
1485 switch (s->state) {
1486 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1487 case MIGRATION_STATUS_POSTCOPY_PAUSED:
1488 case MIGRATION_STATUS_POSTCOPY_RECOVER:
1489 return true;
1490 default:
1491 return false;
1492 }
1493 }
1494
migration_postcopy_is_alive(int state)1495 bool migration_postcopy_is_alive(int state)
1496 {
1497 switch (state) {
1498 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1499 case MIGRATION_STATUS_POSTCOPY_RECOVER:
1500 return true;
1501 default:
1502 return false;
1503 }
1504 }
1505
/*
 * Return true if the migration is in postcopy and the
 * postcopy_after_devices flag of @s is set.
 */
bool migration_in_postcopy_after_devices(MigrationState *s)
{
    if (!migration_in_postcopy()) {
        return false;
    }
    return s->postcopy_after_devices;
}
1510
migration_in_incoming_postcopy(void)1511 bool migration_in_incoming_postcopy(void)
1512 {
1513 PostcopyState ps = postcopy_state_get();
1514
1515 return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
1516 }
1517
migration_incoming_postcopy_advised(void)1518 bool migration_incoming_postcopy_advised(void)
1519 {
1520 PostcopyState ps = postcopy_state_get();
1521
1522 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
1523 }
1524
migration_in_bg_snapshot(void)1525 bool migration_in_bg_snapshot(void)
1526 {
1527 MigrationState *s = migrate_get_current();
1528
1529 return migrate_background_snapshot() &&
1530 migration_is_setup_or_active(s->state);
1531 }
1532
migration_is_idle(void)1533 bool migration_is_idle(void)
1534 {
1535 MigrationState *s = current_migration;
1536
1537 if (!s) {
1538 return true;
1539 }
1540
1541 switch (s->state) {
1542 case MIGRATION_STATUS_NONE:
1543 case MIGRATION_STATUS_CANCELLED:
1544 case MIGRATION_STATUS_COMPLETED:
1545 case MIGRATION_STATUS_FAILED:
1546 return true;
1547 case MIGRATION_STATUS_SETUP:
1548 case MIGRATION_STATUS_CANCELLING:
1549 case MIGRATION_STATUS_ACTIVE:
1550 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1551 case MIGRATION_STATUS_COLO:
1552 case MIGRATION_STATUS_PRE_SWITCHOVER:
1553 case MIGRATION_STATUS_DEVICE:
1554 case MIGRATION_STATUS_WAIT_UNPLUG:
1555 return false;
1556 case MIGRATION_STATUS__MAX:
1557 g_assert_not_reached();
1558 }
1559
1560 return false;
1561 }
1562
/*
 * Return true if the state of @s is MIGRATION_STATUS_ACTIVE or
 * MIGRATION_STATUS_POSTCOPY_ACTIVE.
 */
bool migration_is_active(MigrationState *s)
{
    switch (s->state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
        return true;
    default:
        return false;
    }
}
1568
migrate_init(MigrationState * s,Error ** errp)1569 int migrate_init(MigrationState *s, Error **errp)
1570 {
1571 int ret;
1572
1573 ret = qemu_savevm_state_prepare(errp);
1574 if (ret) {
1575 return ret;
1576 }
1577
1578 /*
1579 * Reinitialise all migration state, except
1580 * parameters/capabilities that the user set, and
1581 * locks.
1582 */
1583 s->cleanup_bh = 0;
1584 s->vm_start_bh = 0;
1585 s->to_dst_file = NULL;
1586 s->state = MIGRATION_STATUS_NONE;
1587 s->rp_state.from_dst_file = NULL;
1588 s->mbps = 0.0;
1589 s->pages_per_second = 0.0;
1590 s->downtime = 0;
1591 s->expected_downtime = 0;
1592 s->setup_time = 0;
1593 s->start_postcopy = false;
1594 s->postcopy_after_devices = false;
1595 s->migration_thread_running = false;
1596 error_free(s->error);
1597 s->error = NULL;
1598 s->hostname = NULL;
1599 s->vmdesc = NULL;
1600
1601 migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
1602
1603 s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1604 s->total_time = 0;
1605 s->vm_old_state = -1;
1606 s->iteration_initial_bytes = 0;
1607 s->threshold_size = 0;
1608 s->switchover_acked = false;
1609 s->rdma_migration = false;
1610 /*
1611 * set mig_stats memory to zero for a new migration
1612 */
1613 memset(&mig_stats, 0, sizeof(mig_stats));
1614 migration_reset_vfio_bytes_transferred();
1615
1616 return 0;
1617 }
1618
is_busy(Error ** reasonp,Error ** errp)1619 static bool is_busy(Error **reasonp, Error **errp)
1620 {
1621 ERRP_GUARD();
1622
1623 /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
1624 if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
1625 error_propagate_prepend(errp, *reasonp,
1626 "disallowing migration blocker "
1627 "(migration/snapshot in progress) for: ");
1628 *reasonp = NULL;
1629 return true;
1630 }
1631 return false;
1632 }
1633
is_only_migratable(Error ** reasonp,Error ** errp,int modes)1634 static bool is_only_migratable(Error **reasonp, Error **errp, int modes)
1635 {
1636 ERRP_GUARD();
1637
1638 if (only_migratable && (modes & BIT(MIG_MODE_NORMAL))) {
1639 error_propagate_prepend(errp, *reasonp,
1640 "disallowing migration blocker "
1641 "(--only-migratable) for: ");
1642 *reasonp = NULL;
1643 return true;
1644 }
1645 return false;
1646 }
1647
/*
 * Build a bitmask of migration modes from @mode followed by the MigMode
 * values in @ap.  The list ends at the first -1 or MIG_MODE_ALL; when it
 * ends with MIG_MODE_ALL the mask covers every mode below MIG_MODE__MAX,
 * regardless of the modes listed before it.
 */
static int get_modes(MigMode mode, va_list ap)
{
    int modes = 0;

    for (; mode != -1 && mode != MIG_MODE_ALL; mode = va_arg(ap, MigMode)) {
        assert(mode >= MIG_MODE_NORMAL && mode < MIG_MODE__MAX);
        modes |= BIT(mode);
    }

    if (mode != MIG_MODE_ALL) {
        return modes;
    }
    return BIT(MIG_MODE__MAX) - 1;
}
1662
add_blockers(Error ** reasonp,Error ** errp,int modes)1663 static int add_blockers(Error **reasonp, Error **errp, int modes)
1664 {
1665 for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) {
1666 if (modes & BIT(mode)) {
1667 migration_blockers[mode] = g_slist_prepend(migration_blockers[mode],
1668 *reasonp);
1669 }
1670 }
1671 return 0;
1672 }
1673
migrate_add_blocker(Error ** reasonp,Error ** errp)1674 int migrate_add_blocker(Error **reasonp, Error **errp)
1675 {
1676 return migrate_add_blocker_modes(reasonp, errp, MIG_MODE_ALL);
1677 }
1678
migrate_add_blocker_normal(Error ** reasonp,Error ** errp)1679 int migrate_add_blocker_normal(Error **reasonp, Error **errp)
1680 {
1681 return migrate_add_blocker_modes(reasonp, errp, MIG_MODE_NORMAL, -1);
1682 }
1683
migrate_add_blocker_modes(Error ** reasonp,Error ** errp,MigMode mode,...)1684 int migrate_add_blocker_modes(Error **reasonp, Error **errp, MigMode mode, ...)
1685 {
1686 int modes;
1687 va_list ap;
1688
1689 va_start(ap, mode);
1690 modes = get_modes(mode, ap);
1691 va_end(ap);
1692
1693 if (is_only_migratable(reasonp, errp, modes)) {
1694 return -EACCES;
1695 } else if (is_busy(reasonp, errp)) {
1696 return -EBUSY;
1697 }
1698 return add_blockers(reasonp, errp, modes);
1699 }
1700
migrate_add_blocker_internal(Error ** reasonp,Error ** errp)1701 int migrate_add_blocker_internal(Error **reasonp, Error **errp)
1702 {
1703 int modes = BIT(MIG_MODE__MAX) - 1;
1704
1705 if (is_busy(reasonp, errp)) {
1706 return -EBUSY;
1707 }
1708 return add_blockers(reasonp, errp, modes);
1709 }
1710
migrate_del_blocker(Error ** reasonp)1711 void migrate_del_blocker(Error **reasonp)
1712 {
1713 if (*reasonp) {
1714 for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) {
1715 migration_blockers[mode] = g_slist_remove(migration_blockers[mode],
1716 *reasonp);
1717 }
1718 error_free(*reasonp);
1719 *reasonp = NULL;
1720 }
1721 }
1722
qmp_migrate_incoming(const char * uri,bool has_channels,MigrationChannelList * channels,Error ** errp)1723 void qmp_migrate_incoming(const char *uri, bool has_channels,
1724 MigrationChannelList *channels, Error **errp)
1725 {
1726 Error *local_err = NULL;
1727 static bool once = true;
1728
1729 if (!once) {
1730 error_setg(errp, "The incoming migration has already been started");
1731 return;
1732 }
1733 if (!runstate_check(RUN_STATE_INMIGRATE)) {
1734 error_setg(errp, "'-incoming' was not specified on the command line");
1735 return;
1736 }
1737
1738 if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
1739 return;
1740 }
1741
1742 qemu_start_incoming_migration(uri, has_channels, channels, &local_err);
1743
1744 if (local_err) {
1745 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1746 error_propagate(errp, local_err);
1747 return;
1748 }
1749
1750 once = false;
1751 }
1752
qmp_migrate_recover(const char * uri,Error ** errp)1753 void qmp_migrate_recover(const char *uri, Error **errp)
1754 {
1755 MigrationIncomingState *mis = migration_incoming_get_current();
1756
1757 /*
1758 * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
1759 * callers (no one should ignore a recover failure); if there is, it's a
1760 * programming error.
1761 */
1762 assert(errp);
1763
1764 if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
1765 error_setg(errp, "Migrate recover can only be run "
1766 "when postcopy is paused.");
1767 return;
1768 }
1769
1770 /* If there's an existing transport, release it */
1771 migration_incoming_transport_cleanup(mis);
1772
1773 /*
1774 * Note that this call will never start a real migration; it will
1775 * only re-setup the migration stream and poke existing migration
1776 * to continue using that newly established channel.
1777 */
1778 qemu_start_incoming_migration(uri, false, NULL, errp);
1779 }
1780
qmp_migrate_pause(Error ** errp)1781 void qmp_migrate_pause(Error **errp)
1782 {
1783 MigrationState *ms = migrate_get_current();
1784 MigrationIncomingState *mis = migration_incoming_get_current();
1785 int ret = 0;
1786
1787 if (migration_postcopy_is_alive(ms->state)) {
1788 /* Source side, during postcopy */
1789 Error *error = NULL;
1790
1791 /* Tell the core migration that we're pausing */
1792 error_setg(&error, "Postcopy migration is paused by the user");
1793 migrate_set_error(ms, error);
1794 error_free(error);
1795
1796 qemu_mutex_lock(&ms->qemu_file_lock);
1797 if (ms->to_dst_file) {
1798 ret = qemu_file_shutdown(ms->to_dst_file);
1799 }
1800 qemu_mutex_unlock(&ms->qemu_file_lock);
1801 if (ret) {
1802 error_setg(errp, "Failed to pause source migration");
1803 }
1804
1805 /*
1806 * Kick the migration thread out of any waiting windows (on behalf
1807 * of the rp thread).
1808 */
1809 migration_rp_kick(ms);
1810
1811 return;
1812 }
1813
1814 if (migration_postcopy_is_alive(mis->state)) {
1815 ret = qemu_file_shutdown(mis->from_src_file);
1816 if (ret) {
1817 error_setg(errp, "Failed to pause destination migration");
1818 }
1819 return;
1820 }
1821
1822 error_setg(errp, "migrate-pause is currently only supported "
1823 "during postcopy-active or postcopy-recover state");
1824 }
1825
migration_is_blocked(Error ** errp)1826 bool migration_is_blocked(Error **errp)
1827 {
1828 GSList *blockers = migration_blockers[migrate_mode()];
1829
1830 if (qemu_savevm_state_blocked(errp)) {
1831 return true;
1832 }
1833
1834 if (blockers) {
1835 error_propagate(errp, error_copy(blockers->data));
1836 return true;
1837 }
1838
1839 return false;
1840 }
1841
1842 /* Returns true if continue to migrate, or false if error detected */
migrate_prepare(MigrationState * s,bool blk,bool blk_inc,bool resume,Error ** errp)1843 static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
1844 bool resume, Error **errp)
1845 {
1846 Error *local_err = NULL;
1847
1848 if (blk_inc) {
1849 warn_report("parameter 'inc' is deprecated;"
1850 " use blockdev-mirror with NBD instead");
1851 }
1852
1853 if (blk) {
1854 warn_report("parameter 'blk' is deprecated;"
1855 " use blockdev-mirror with NBD instead");
1856 }
1857
1858 if (resume) {
1859 if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
1860 error_setg(errp, "Cannot resume if there is no "
1861 "paused migration");
1862 return false;
1863 }
1864
1865 /*
1866 * Postcopy recovery won't work well with release-ram
1867 * capability since release-ram will drop the page buffer as
1868 * long as the page is put into the send buffer. So if there
1869 * is a network failure happened, any page buffers that have
1870 * not yet reached the destination VM but have already been
1871 * sent from the source VM will be lost forever. Let's refuse
1872 * the client from resuming such a postcopy migration.
1873 * Luckily release-ram was designed to only be used when src
1874 * and destination VMs are on the same host, so it should be
1875 * fine.
1876 */
1877 if (migrate_release_ram()) {
1878 error_setg(errp, "Postcopy recovery cannot work "
1879 "when release-ram capability is set");
1880 return false;
1881 }
1882
1883 /* This is a resume, skip init status */
1884 return true;
1885 }
1886
1887 if (migration_is_running(s->state)) {
1888 error_setg(errp, QERR_MIGRATION_ACTIVE);
1889 return false;
1890 }
1891
1892 if (runstate_check(RUN_STATE_INMIGRATE)) {
1893 error_setg(errp, "Guest is waiting for an incoming migration");
1894 return false;
1895 }
1896
1897 if (runstate_check(RUN_STATE_POSTMIGRATE)) {
1898 error_setg(errp, "Can't migrate the vm that was paused due to "
1899 "previous migration");
1900 return false;
1901 }
1902
1903 if (migration_is_blocked(errp)) {
1904 return false;
1905 }
1906
1907 if (blk || blk_inc) {
1908 if (migrate_colo()) {
1909 error_setg(errp, "No disk migration is required in COLO mode");
1910 return false;
1911 }
1912 if (migrate_block() || migrate_block_incremental()) {
1913 error_setg(errp, "Command options are incompatible with "
1914 "current migration capabilities");
1915 return false;
1916 }
1917 if (!migrate_cap_set(MIGRATION_CAPABILITY_BLOCK, true, &local_err)) {
1918 error_propagate(errp, local_err);
1919 return false;
1920 }
1921 s->must_remove_block_options = true;
1922 }
1923
1924 if (blk_inc) {
1925 migrate_set_block_incremental(true);
1926 }
1927
1928 if (migrate_init(s, errp)) {
1929 return false;
1930 }
1931
1932 return true;
1933 }
1934
qmp_migrate(const char * uri,bool has_channels,MigrationChannelList * channels,bool has_blk,bool blk,bool has_inc,bool inc,bool has_detach,bool detach,bool has_resume,bool resume,Error ** errp)1935 void qmp_migrate(const char *uri, bool has_channels,
1936 MigrationChannelList *channels, bool has_blk, bool blk,
1937 bool has_inc, bool inc, bool has_detach, bool detach,
1938 bool has_resume, bool resume, Error **errp)
1939 {
1940 bool resume_requested;
1941 Error *local_err = NULL;
1942 MigrationState *s = migrate_get_current();
1943 g_autoptr(MigrationChannel) channel = NULL;
1944 MigrationAddress *addr = NULL;
1945
1946 /*
1947 * Having preliminary checks for uri and channel
1948 */
1949 if (uri && has_channels) {
1950 error_setg(errp, "'uri' and 'channels' arguments are mutually "
1951 "exclusive; exactly one of the two should be present in "
1952 "'migrate' qmp command ");
1953 return;
1954 } else if (channels) {
1955 /* To verify that Migrate channel list has only item */
1956 if (channels->next) {
1957 error_setg(errp, "Channel list has more than one entries");
1958 return;
1959 }
1960 addr = channels->value->addr;
1961 } else if (uri) {
1962 /* caller uses the old URI syntax */
1963 if (!migrate_uri_parse(uri, &channel, errp)) {
1964 return;
1965 }
1966 addr = channel->addr;
1967 } else {
1968 error_setg(errp, "neither 'uri' or 'channels' argument are "
1969 "specified in 'migrate' qmp command ");
1970 return;
1971 }
1972
1973 /* transport mechanism not suitable for migration? */
1974 if (!migration_channels_and_transport_compatible(addr, errp)) {
1975 return;
1976 }
1977
1978 resume_requested = has_resume && resume;
1979 if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
1980 resume_requested, errp)) {
1981 /* Error detected, put into errp */
1982 return;
1983 }
1984
1985 if (!resume_requested) {
1986 if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
1987 return;
1988 }
1989 }
1990
1991 if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
1992 SocketAddress *saddr = &addr->u.socket;
1993 if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
1994 saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
1995 saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
1996 socket_start_outgoing_migration(s, saddr, &local_err);
1997 } else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
1998 fd_start_outgoing_migration(s, saddr->u.fd.str, &local_err);
1999 }
2000 #ifdef CONFIG_RDMA
2001 } else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
2002 rdma_start_outgoing_migration(s, &addr->u.rdma, &local_err);
2003 #endif
2004 } else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
2005 exec_start_outgoing_migration(s, addr->u.exec.args, &local_err);
2006 } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
2007 file_start_outgoing_migration(s, &addr->u.file, &local_err);
2008 } else {
2009 error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri",
2010 "a valid migration protocol");
2011 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2012 MIGRATION_STATUS_FAILED);
2013 block_cleanup_parameters();
2014 }
2015
2016 if (local_err) {
2017 if (!resume_requested) {
2018 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2019 }
2020 migrate_fd_error(s, local_err);
2021 error_propagate(errp, local_err);
2022 return;
2023 }
2024 }
2025
/*
 * Cancel entry point: forwards to migration_cancel() without passing an
 * error object.  @errp is accepted but never written to by this function.
 */
void qmp_migrate_cancel(Error **errp)
{
    migration_cancel(NULL);
}
2030
qmp_migrate_continue(MigrationStatus state,Error ** errp)2031 void qmp_migrate_continue(MigrationStatus state, Error **errp)
2032 {
2033 MigrationState *s = migrate_get_current();
2034 if (s->state != state) {
2035 error_setg(errp, "Migration not in expected state: %s",
2036 MigrationStatus_str(s->state));
2037 return;
2038 }
2039 qemu_sem_post(&s->pause_sem);
2040 }
2041
/*
 * Block on the return-path semaphore (rp_sem) unless the migration has
 * already recorded an error.
 *
 * Returns 0 when the wait finished and no error is recorded, or -1 when an
 * error was recorded either before the wait (in which case we do not wait
 * at all) or by the time the wait returned.
 */
int migration_rp_wait(MigrationState *s)
{
    if (!migrate_has_error(s)) {
        qemu_sem_wait(&s->rp_state.rp_sem);

        /* Re-check: a failure may have been recorded while we slept */
        if (!migrate_has_error(s)) {
            return 0;
        }
    }

    return -1;
}
2058
/*
 * Post the return-path semaphore (rp_sem) once; this is the semaphore that
 * migration_rp_wait() blocks on.
 */
void migration_rp_kick(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
2063
2064 static struct rp_cmd_args {
2065 ssize_t len; /* -1 = variable */
2066 const char *name;
2067 } rp_cmd_args[] = {
2068 [MIG_RP_MSG_INVALID] = { .len = -1, .name = "INVALID" },
2069 [MIG_RP_MSG_SHUT] = { .len = 4, .name = "SHUT" },
2070 [MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" },
2071 [MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" },
2072 [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" },
2073 [MIG_RP_MSG_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" },
2074 [MIG_RP_MSG_RESUME_ACK] = { .len = 4, .name = "RESUME_ACK" },
2075 [MIG_RP_MSG_SWITCHOVER_ACK] = { .len = 0, .name = "SWITCHOVER_ACK" },
2076 [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" },
2077 };
2078
2079 /*
2080 * Process a request for pages received on the return path,
2081 * We're allowed to send more than requested (e.g. to round to our page size)
2082 * and we don't need to send pages that have already been sent.
2083 */
2084 static void
migrate_handle_rp_req_pages(MigrationState * ms,const char * rbname,ram_addr_t start,size_t len,Error ** errp)2085 migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
2086 ram_addr_t start, size_t len, Error **errp)
2087 {
2088 long our_host_ps = qemu_real_host_page_size();
2089
2090 trace_migrate_handle_rp_req_pages(rbname, start, len);
2091
2092 /*
2093 * Since we currently insist on matching page sizes, just sanity check
2094 * we're being asked for whole host pages.
2095 */
2096 if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
2097 !QEMU_IS_ALIGNED(len, our_host_ps)) {
2098 error_setg(errp, "MIG_RP_MSG_REQ_PAGES: Misaligned page request, start:"
2099 RAM_ADDR_FMT " len: %zd", start, len);
2100 return;
2101 }
2102
2103 ram_save_queue_pages(rbname, start, len, errp);
2104 }
2105
migrate_handle_rp_recv_bitmap(MigrationState * s,char * block_name,Error ** errp)2106 static bool migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name,
2107 Error **errp)
2108 {
2109 RAMBlock *block = qemu_ram_block_by_name(block_name);
2110
2111 if (!block) {
2112 error_setg(errp, "MIG_RP_MSG_RECV_BITMAP has invalid block name '%s'",
2113 block_name);
2114 return false;
2115 }
2116
2117 /* Fetch the received bitmap and refresh the dirty bitmap */
2118 return ram_dirty_bitmap_reload(s, block, errp);
2119 }
2120
migrate_handle_rp_resume_ack(MigrationState * s,uint32_t value,Error ** errp)2121 static bool migrate_handle_rp_resume_ack(MigrationState *s,
2122 uint32_t value, Error **errp)
2123 {
2124 trace_source_return_path_thread_resume_ack(value);
2125
2126 if (value != MIGRATION_RESUME_ACK_VALUE) {
2127 error_setg(errp, "illegal resume_ack value %"PRIu32, value);
2128 return false;
2129 }
2130
2131 /* Now both sides are active. */
2132 migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2133 MIGRATION_STATUS_POSTCOPY_ACTIVE);
2134
2135 /* Notify send thread that time to continue send pages */
2136 migration_rp_kick(s);
2137
2138 return true;
2139 }
2140
2141 /*
2142 * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src if
2143 * existed) in a safe way.
2144 */
migration_release_dst_files(MigrationState * ms)2145 static void migration_release_dst_files(MigrationState *ms)
2146 {
2147 QEMUFile *file;
2148
2149 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2150 /*
2151 * Reset the from_dst_file pointer first before releasing it, as we
2152 * can't block within lock section
2153 */
2154 file = ms->rp_state.from_dst_file;
2155 ms->rp_state.from_dst_file = NULL;
2156 }
2157
2158 /*
2159 * Do the same to postcopy fast path socket too if there is. No
2160 * locking needed because this qemufile should only be managed by
2161 * return path thread.
2162 */
2163 if (ms->postcopy_qemufile_src) {
2164 migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
2165 qemu_file_shutdown(ms->postcopy_qemufile_src);
2166 qemu_fclose(ms->postcopy_qemufile_src);
2167 ms->postcopy_qemufile_src = NULL;
2168 }
2169
2170 qemu_fclose(file);
2171 }
2172
2173 /*
2174 * Handles messages sent on the return path towards the source VM
2175 *
2176 */
source_return_path_thread(void * opaque)2177 static void *source_return_path_thread(void *opaque)
2178 {
2179 MigrationState *ms = opaque;
2180 QEMUFile *rp = ms->rp_state.from_dst_file;
2181 uint16_t header_len, header_type;
2182 uint8_t buf[512];
2183 uint32_t tmp32, sibling_error;
2184 ram_addr_t start = 0; /* =0 to silence warning */
2185 size_t len = 0, expected_len;
2186 Error *err = NULL;
2187 int res;
2188
2189 trace_source_return_path_thread_entry();
2190 rcu_register_thread();
2191
2192 while (migration_is_setup_or_active(ms->state)) {
2193 trace_source_return_path_thread_loop_top();
2194
2195 header_type = qemu_get_be16(rp);
2196 header_len = qemu_get_be16(rp);
2197
2198 if (qemu_file_get_error(rp)) {
2199 qemu_file_get_error_obj(rp, &err);
2200 goto out;
2201 }
2202
2203 if (header_type >= MIG_RP_MSG_MAX ||
2204 header_type == MIG_RP_MSG_INVALID) {
2205 error_setg(&err, "Received invalid message 0x%04x length 0x%04x",
2206 header_type, header_len);
2207 goto out;
2208 }
2209
2210 if ((rp_cmd_args[header_type].len != -1 &&
2211 header_len != rp_cmd_args[header_type].len) ||
2212 header_len > sizeof(buf)) {
2213 error_setg(&err, "Received '%s' message (0x%04x) with"
2214 "incorrect length %d expecting %zu",
2215 rp_cmd_args[header_type].name, header_type, header_len,
2216 (size_t)rp_cmd_args[header_type].len);
2217 goto out;
2218 }
2219
2220 /* We know we've got a valid header by this point */
2221 res = qemu_get_buffer(rp, buf, header_len);
2222 if (res != header_len) {
2223 error_setg(&err, "Failed reading data for message 0x%04x"
2224 " read %d expected %d",
2225 header_type, res, header_len);
2226 goto out;
2227 }
2228
2229 /* OK, we have the message and the data */
2230 switch (header_type) {
2231 case MIG_RP_MSG_SHUT:
2232 sibling_error = ldl_be_p(buf);
2233 trace_source_return_path_thread_shut(sibling_error);
2234 if (sibling_error) {
2235 error_setg(&err, "Sibling indicated error %d", sibling_error);
2236 }
2237 /*
2238 * We'll let the main thread deal with closing the RP
2239 * we could do a shutdown(2) on it, but we're the only user
2240 * anyway, so there's nothing gained.
2241 */
2242 goto out;
2243
2244 case MIG_RP_MSG_PONG:
2245 tmp32 = ldl_be_p(buf);
2246 trace_source_return_path_thread_pong(tmp32);
2247 qemu_sem_post(&ms->rp_state.rp_pong_acks);
2248 break;
2249
2250 case MIG_RP_MSG_REQ_PAGES:
2251 start = ldq_be_p(buf);
2252 len = ldl_be_p(buf + 8);
2253 migrate_handle_rp_req_pages(ms, NULL, start, len, &err);
2254 if (err) {
2255 goto out;
2256 }
2257 break;
2258
2259 case MIG_RP_MSG_REQ_PAGES_ID:
2260 expected_len = 12 + 1; /* header + termination */
2261
2262 if (header_len >= expected_len) {
2263 start = ldq_be_p(buf);
2264 len = ldl_be_p(buf + 8);
2265 /* Now we expect an idstr */
2266 tmp32 = buf[12]; /* Length of the following idstr */
2267 buf[13 + tmp32] = '\0';
2268 expected_len += tmp32;
2269 }
2270 if (header_len != expected_len) {
2271 error_setg(&err, "Req_Page_id with length %d expecting %zd",
2272 header_len, expected_len);
2273 goto out;
2274 }
2275 migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len,
2276 &err);
2277 if (err) {
2278 goto out;
2279 }
2280 break;
2281
2282 case MIG_RP_MSG_RECV_BITMAP:
2283 if (header_len < 1) {
2284 error_setg(&err, "MIG_RP_MSG_RECV_BITMAP missing block name");
2285 goto out;
2286 }
2287 /* Format: len (1B) + idstr (<255B). This ends the idstr. */
2288 buf[buf[0] + 1] = '\0';
2289 if (!migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1), &err)) {
2290 goto out;
2291 }
2292 break;
2293
2294 case MIG_RP_MSG_RESUME_ACK:
2295 tmp32 = ldl_be_p(buf);
2296 if (!migrate_handle_rp_resume_ack(ms, tmp32, &err)) {
2297 goto out;
2298 }
2299 break;
2300
2301 case MIG_RP_MSG_SWITCHOVER_ACK:
2302 ms->switchover_acked = true;
2303 trace_source_return_path_thread_switchover_acked();
2304 break;
2305
2306 default:
2307 break;
2308 }
2309 }
2310
2311 out:
2312 if (err) {
2313 migrate_set_error(ms, err);
2314 error_free(err);
2315 trace_source_return_path_thread_bad_end();
2316 }
2317
2318 if (ms->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2319 /*
2320 * this will be extremely unlikely: that we got yet another network
2321 * issue during recovering of the 1st network failure.. during this
2322 * period the main migration thread can be waiting on rp_sem for
2323 * this thread to sync with the other side.
2324 *
2325 * When this happens, explicitly kick the migration thread out of
2326 * RECOVER stage and back to PAUSED, so the admin can try
2327 * everything again.
2328 */
2329 migration_rp_kick(ms);
2330 }
2331
2332 trace_source_return_path_thread_end();
2333 rcu_unregister_thread();
2334
2335 return NULL;
2336 }
2337
/*
 * Open the return path for the outgoing stream and start the thread that
 * services it.
 *
 * The return-path file obtained from ms->to_dst_file is stored in
 * ms->rp_state.from_dst_file, and a joinable "return path" thread running
 * source_return_path_thread() is created.
 *
 * Returns 0 on success, -1 when no return-path file could be obtained.
 */
static int open_return_path_on_source(MigrationState *ms)
{
    QEMUFile *rp_file = qemu_file_get_return_path(ms->to_dst_file);

    ms->rp_state.from_dst_file = rp_file;
    if (rp_file == NULL) {
        return -1;
    }

    trace_open_return_path_on_source();

    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
    ms->rp_state.rp_thread_created = true;

    trace_open_return_path_on_source_continue();

    return 0;
}
2355
2356 /* Return true if error detected, or false otherwise */
close_return_path_on_source(MigrationState * ms)2357 static bool close_return_path_on_source(MigrationState *ms)
2358 {
2359 if (!ms->rp_state.rp_thread_created) {
2360 return false;
2361 }
2362
2363 trace_migration_return_path_end_before();
2364
2365 /*
2366 * If this is a normal exit then the destination will send a SHUT
2367 * and the rp_thread will exit, however if there's an error we
2368 * need to cause it to exit. shutdown(2), if we have it, will
2369 * cause it to unblock if it's stuck waiting for the destination.
2370 */
2371 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2372 if (ms->to_dst_file && ms->rp_state.from_dst_file &&
2373 qemu_file_get_error(ms->to_dst_file)) {
2374 qemu_file_shutdown(ms->rp_state.from_dst_file);
2375 }
2376 }
2377
2378 qemu_thread_join(&ms->rp_state.rp_thread);
2379 ms->rp_state.rp_thread_created = false;
2380 migration_release_dst_files(ms);
2381 trace_migration_return_path_end_after();
2382
2383 /* Return path will persist the error in MigrationState when quit */
2384 return migrate_has_error(ms);
2385 }
2386
/*
 * Block on the rp_pong_acks semaphore, which the return path thread posts
 * each time it handles a MIG_RP_MSG_PONG message.
 */
static inline void
migration_wait_main_channel(MigrationState *ms)
{
    /* Wait until one PONG message received */
    qemu_sem_wait(&ms->rp_state.rp_pong_acks);
}
2393
2394 /*
2395 * Switch from normal iteration to postcopy
2396 * Returns non-0 on error
2397 */
postcopy_start(MigrationState * ms,Error ** errp)2398 static int postcopy_start(MigrationState *ms, Error **errp)
2399 {
2400 int ret;
2401 QIOChannelBuffer *bioc;
2402 QEMUFile *fb;
2403 uint64_t bandwidth = migrate_max_postcopy_bandwidth();
2404 bool restart_block = false;
2405 int cur_state = MIGRATION_STATUS_ACTIVE;
2406
2407 if (migrate_postcopy_preempt()) {
2408 migration_wait_main_channel(ms);
2409 if (postcopy_preempt_establish_channel(ms)) {
2410 migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
2411 return -1;
2412 }
2413 }
2414
2415 if (!migrate_pause_before_switchover()) {
2416 migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
2417 MIGRATION_STATUS_POSTCOPY_ACTIVE);
2418 }
2419
2420 trace_postcopy_start();
2421 qemu_mutex_lock_iothread();
2422 trace_postcopy_start_set_run();
2423
2424 migration_downtime_start(ms);
2425
2426 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
2427 global_state_store();
2428 ret = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
2429 if (ret < 0) {
2430 goto fail;
2431 }
2432
2433 ret = migration_maybe_pause(ms, &cur_state,
2434 MIGRATION_STATUS_POSTCOPY_ACTIVE);
2435 if (ret < 0) {
2436 goto fail;
2437 }
2438
2439 ret = bdrv_inactivate_all();
2440 if (ret < 0) {
2441 goto fail;
2442 }
2443 restart_block = true;
2444
2445 /*
2446 * Cause any non-postcopiable, but iterative devices to
2447 * send out their final data.
2448 */
2449 qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
2450
2451 /*
2452 * in Finish migrate and with the io-lock held everything should
2453 * be quiet, but we've potentially still got dirty pages and we
2454 * need to tell the destination to throw any pages it's already received
2455 * that are dirty
2456 */
2457 if (migrate_postcopy_ram()) {
2458 ram_postcopy_send_discard_bitmap(ms);
2459 }
2460
2461 /*
2462 * send rest of state - note things that are doing postcopy
2463 * will notice we're in POSTCOPY_ACTIVE and not actually
2464 * wrap their state up here
2465 */
2466 migration_rate_set(bandwidth);
2467 if (migrate_postcopy_ram()) {
2468 /* Ping just for debugging, helps line traces up */
2469 qemu_savevm_send_ping(ms->to_dst_file, 2);
2470 }
2471
2472 /*
2473 * While loading the device state we may trigger page transfer
2474 * requests and the fd must be free to process those, and thus
2475 * the destination must read the whole device state off the fd before
2476 * it starts processing it. Unfortunately the ad-hoc migration format
2477 * doesn't allow the destination to know the size to read without fully
2478 * parsing it through each devices load-state code (especially the open
2479 * coded devices that use get/put).
2480 * So we wrap the device state up in a package with a length at the start;
2481 * to do this we use a qemu_buf to hold the whole of the device state.
2482 */
2483 bioc = qio_channel_buffer_new(4096);
2484 qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
2485 fb = qemu_file_new_output(QIO_CHANNEL(bioc));
2486 object_unref(OBJECT(bioc));
2487
2488 /*
2489 * Make sure the receiver can get incoming pages before we send the rest
2490 * of the state
2491 */
2492 qemu_savevm_send_postcopy_listen(fb);
2493
2494 qemu_savevm_state_complete_precopy(fb, false, false);
2495 if (migrate_postcopy_ram()) {
2496 qemu_savevm_send_ping(fb, 3);
2497 }
2498
2499 qemu_savevm_send_postcopy_run(fb);
2500
2501 /* <><> end of stuff going into the package */
2502
2503 /* Last point of recovery; as soon as we send the package the destination
2504 * can open devices and potentially start running.
2505 * Lets just check again we've not got any errors.
2506 */
2507 ret = qemu_file_get_error(ms->to_dst_file);
2508 if (ret) {
2509 error_setg(errp, "postcopy_start: Migration stream errored (pre package)");
2510 goto fail_closefb;
2511 }
2512
2513 restart_block = false;
2514
2515 /* Now send that blob */
2516 if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
2517 goto fail_closefb;
2518 }
2519 qemu_fclose(fb);
2520
2521 /* Send a notify to give a chance for anything that needs to happen
2522 * at the transition to postcopy and after the device state; in particular
2523 * spice needs to trigger a transition now
2524 */
2525 ms->postcopy_after_devices = true;
2526 migration_call_notifiers(ms);
2527
2528 migration_downtime_end(ms);
2529
2530 qemu_mutex_unlock_iothread();
2531
2532 if (migrate_postcopy_ram()) {
2533 /*
2534 * Although this ping is just for debug, it could potentially be
2535 * used for getting a better measurement of downtime at the source.
2536 */
2537 qemu_savevm_send_ping(ms->to_dst_file, 4);
2538 }
2539
2540 if (migrate_release_ram()) {
2541 ram_postcopy_migrated_memory_release(ms);
2542 }
2543
2544 ret = qemu_file_get_error(ms->to_dst_file);
2545 if (ret) {
2546 error_setg(errp, "postcopy_start: Migration stream errored");
2547 migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2548 MIGRATION_STATUS_FAILED);
2549 }
2550
2551 trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
2552
2553 return ret;
2554
2555 fail_closefb:
2556 qemu_fclose(fb);
2557 fail:
2558 migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2559 MIGRATION_STATUS_FAILED);
2560 if (restart_block) {
2561 /* A failure happened early enough that we know the destination hasn't
2562 * accessed block devices, so we're safe to recover.
2563 */
2564 Error *local_err = NULL;
2565
2566 bdrv_activate_all(&local_err);
2567 if (local_err) {
2568 error_report_err(local_err);
2569 }
2570 }
2571 qemu_mutex_unlock_iothread();
2572 return -1;
2573 }
2574
2575 /**
2576 * migration_maybe_pause: Pause if required to by
2577 * migrate_pause_before_switchover called with the iothread locked
2578 * Returns: 0 on success
2579 */
migration_maybe_pause(MigrationState * s,int * current_active_state,int new_state)2580 static int migration_maybe_pause(MigrationState *s,
2581 int *current_active_state,
2582 int new_state)
2583 {
2584 if (!migrate_pause_before_switchover()) {
2585 return 0;
2586 }
2587
2588 /* Since leaving this state is not atomic with posting the semaphore
2589 * it's possible that someone could have issued multiple migrate_continue
2590 * and the semaphore is incorrectly positive at this point;
2591 * the docs say it's undefined to reinit a semaphore that's already
2592 * init'd, so use timedwait to eat up any existing posts.
2593 */
2594 while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
2595 /* This block intentionally left blank */
2596 }
2597
2598 /*
2599 * If the migration is cancelled when it is in the completion phase,
2600 * the migration state is set to MIGRATION_STATUS_CANCELLING.
2601 * So we don't need to wait a semaphore, otherwise we would always
2602 * wait for the 'pause_sem' semaphore.
2603 */
2604 if (s->state != MIGRATION_STATUS_CANCELLING) {
2605 qemu_mutex_unlock_iothread();
2606 migrate_set_state(&s->state, *current_active_state,
2607 MIGRATION_STATUS_PRE_SWITCHOVER);
2608 qemu_sem_wait(&s->pause_sem);
2609 migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
2610 new_state);
2611 *current_active_state = new_state;
2612 qemu_mutex_lock_iothread();
2613 }
2614
2615 return s->state == new_state ? 0 : -EINVAL;
2616 }
2617
/*
 * Completion step for a precopy migration: with the iothread lock held,
 * stop the VM, optionally pause before switchover, then send the final
 * state on s->to_dst_file.
 *
 * @s: current migration state
 * @current_active_state: in/out status tracker, passed on to
 *                        migration_maybe_pause()
 *
 * Returns the first negative value produced by a step, otherwise the return
 * value of qemu_savevm_state_complete_precopy().
 */
static int migration_completion_precopy(MigrationState *s,
                                        int *current_active_state)
{
    int rc;

    qemu_mutex_lock_iothread();
    migration_downtime_start(s);
    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);

    s->vm_old_state = runstate_get();
    global_state_store();

    rc = migration_stop_vm(RUN_STATE_FINISH_MIGRATE);
    trace_migration_completion_vm_stop(rc);

    if (rc >= 0) {
        rc = migration_maybe_pause(s, current_active_state,
                                   MIGRATION_STATUS_DEVICE);
    }

    if (rc >= 0) {
        /*
         * Inactivate disks except in COLO, and remember that we did so in
         * order to reactivate them if migration fails or is cancelled.
         */
        s->block_inactive = !migrate_colo();
        migration_rate_set(RATE_LIMIT_DISABLED);
        rc = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
                                                s->block_inactive);
    }

    qemu_mutex_unlock_iothread();
    return rc;
}
2654
/*
 * Completion step for a postcopy migration: send the final postcopy state
 * under the iothread lock and, for old destinations, shut down the preempt
 * channel file.
 */
static void migration_completion_postcopy(MigrationState *s)
{
    trace_migration_completion_postcopy_end();

    qemu_mutex_lock_iothread();
    qemu_savevm_state_complete_postcopy(s->to_dst_file);
    qemu_mutex_unlock_iothread();

    /*
     * Shutdown the postcopy fast path thread.  Only an old destination
     * QEMU binary (7.1/7.2) needs this; QEMU 8.0+ does not.
     */
    if (migrate_postcopy_preempt()) {
        if (s->preempt_pre_7_2) {
            postcopy_preempt_shutdown_file(s);
        }
    }

    trace_migration_completion_postcopy_end_after_complete();
}
2673
/*
 * Failure path of migration completion: re-activate block devices when they
 * were inactivated and the migration is still in ACTIVE or DEVICE status,
 * then move the status from @current_active_state to FAILED.
 */
static void migration_completion_failed(MigrationState *s,
                                        int current_active_state)
{
    if (s->block_inactive) {
        if (s->state == MIGRATION_STATUS_ACTIVE ||
            s->state == MIGRATION_STATUS_DEVICE) {
            /*
             * If not doing postcopy, vm_start() will be called: let's
             * regain control on images.
             */
            Error *activate_err = NULL;

            qemu_mutex_lock_iothread();
            bdrv_activate_all(&activate_err);
            if (!activate_err) {
                s->block_inactive = false;
            } else {
                error_report_err(activate_err);
            }
            qemu_mutex_unlock_iothread();
        }
    }

    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_FAILED);
}
2698
2699 /**
2700 * migration_completion: Used by migration_thread when there's not much left.
2701 * The caller 'breaks' the loop when this returns.
2702 *
2703 * @s: Current migration state
2704 */
migration_completion(MigrationState * s)2705 static void migration_completion(MigrationState *s)
2706 {
2707 int ret = 0;
2708 int current_active_state = s->state;
2709
2710 if (s->state == MIGRATION_STATUS_ACTIVE) {
2711 ret = migration_completion_precopy(s, ¤t_active_state);
2712 } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2713 migration_completion_postcopy(s);
2714 } else {
2715 ret = -1;
2716 }
2717
2718 if (ret < 0) {
2719 goto fail;
2720 }
2721
2722 if (close_return_path_on_source(s)) {
2723 goto fail;
2724 }
2725
2726 if (qemu_file_get_error(s->to_dst_file)) {
2727 trace_migration_completion_file_err();
2728 goto fail;
2729 }
2730
2731 if (migrate_colo() && s->state == MIGRATION_STATUS_ACTIVE) {
2732 /* COLO does not support postcopy */
2733 migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
2734 MIGRATION_STATUS_COLO);
2735 } else {
2736 migrate_set_state(&s->state, current_active_state,
2737 MIGRATION_STATUS_COMPLETED);
2738 }
2739
2740 return;
2741
2742 fail:
2743 migration_completion_failed(s, current_active_state);
2744 }
2745
2746 /**
2747 * bg_migration_completion: Used by bg_migration_thread when after all the
2748 * RAM has been saved. The caller 'breaks' the loop when this returns.
2749 *
2750 * @s: Current migration state
2751 */
bg_migration_completion(MigrationState * s)2752 static void bg_migration_completion(MigrationState *s)
2753 {
2754 int current_active_state = s->state;
2755
2756 if (s->state == MIGRATION_STATUS_ACTIVE) {
2757 /*
2758 * By this moment we have RAM content saved into the migration stream.
2759 * The next step is to flush the non-RAM content (device state)
2760 * right after the ram content. The device state has been stored into
2761 * the temporary buffer before RAM saving started.
2762 */
2763 qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
2764 qemu_fflush(s->to_dst_file);
2765 } else if (s->state == MIGRATION_STATUS_CANCELLING) {
2766 goto fail;
2767 }
2768
2769 if (qemu_file_get_error(s->to_dst_file)) {
2770 trace_migration_completion_file_err();
2771 goto fail;
2772 }
2773
2774 migrate_set_state(&s->state, current_active_state,
2775 MIGRATION_STATUS_COMPLETED);
2776 return;
2777
2778 fail:
2779 migrate_set_state(&s->state, current_active_state,
2780 MIGRATION_STATUS_FAILED);
2781 }
2782
/* Outcome of an error check performed by the migration thread */
typedef enum MigThrError {
    MIG_THR_ERR_NONE = 0,       /* No error detected */
    MIG_THR_ERR_RECOVERED = 1,  /* Detected error, but resumed successfully */
    MIG_THR_ERR_FATAL = 2,      /* Detected fatal error, need to exit */
} MigThrError;
2791
/*
 * Send the postcopy-resume command on the outgoing stream, then wait on the
 * return path for as long as the migration stays in POSTCOPY_RECOVER.
 *
 * Returns 0 when the migration ends up in POSTCOPY_ACTIVE, -1 when a wait
 * reported an error or the migration ended up in any other status.
 */
static int postcopy_resume_handshake(MigrationState *s)
{
    qemu_savevm_send_postcopy_resume(s->to_dst_file);

    while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
        if (migration_rp_wait(s) != 0) {
            return -1;
        }
    }

    return s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE ? 0 : -1;
}
2808
2809 /* Return zero if success, or <0 for error */
postcopy_do_resume(MigrationState * s)2810 static int postcopy_do_resume(MigrationState *s)
2811 {
2812 int ret;
2813
2814 /*
2815 * Call all the resume_prepare() hooks, so that modules can be
2816 * ready for the migration resume.
2817 */
2818 ret = qemu_savevm_state_resume_prepare(s);
2819 if (ret) {
2820 error_report("%s: resume_prepare() failure detected: %d",
2821 __func__, ret);
2822 return ret;
2823 }
2824
2825 /*
2826 * If preempt is enabled, re-establish the preempt channel. Note that
2827 * we do it after resume prepare to make sure the main channel will be
2828 * created before the preempt channel. E.g. with weak network, the
2829 * dest QEMU may get messed up with the preempt and main channels on
2830 * the order of connection setup. This guarantees the correct order.
2831 */
2832 ret = postcopy_preempt_establish_channel(s);
2833 if (ret) {
2834 error_report("%s: postcopy_preempt_establish_channel(): %d",
2835 __func__, ret);
2836 return ret;
2837 }
2838
2839 /*
2840 * Last handshake with destination on the resume (destination will
2841 * switch to postcopy-active afterwards)
2842 */
2843 ret = postcopy_resume_handshake(s);
2844 if (ret) {
2845 error_report("%s: handshake failed: %d", __func__, ret);
2846 return ret;
2847 }
2848
2849 return 0;
2850 }
2851
2852 /*
2853 * We don't return until we are in a safe state to continue current
2854 * postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or
2855 * MIG_THR_ERR_FATAL if unrecovery failure happened.
2856 */
postcopy_pause(MigrationState * s)2857 static MigThrError postcopy_pause(MigrationState *s)
2858 {
2859 assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2860
2861 while (true) {
2862 QEMUFile *file;
2863
2864 /*
2865 * Current channel is possibly broken. Release it. Note that this is
2866 * guaranteed even without lock because to_dst_file should only be
2867 * modified by the migration thread. That also guarantees that the
2868 * unregister of yank is safe too without the lock. It should be safe
2869 * even to be within the qemu_file_lock, but we didn't do that to avoid
2870 * taking more mutex (yank_lock) within qemu_file_lock. TL;DR: we make
2871 * the qemu_file_lock critical section as small as possible.
2872 */
2873 assert(s->to_dst_file);
2874 migration_ioc_unregister_yank_from_file(s->to_dst_file);
2875 qemu_mutex_lock(&s->qemu_file_lock);
2876 file = s->to_dst_file;
2877 s->to_dst_file = NULL;
2878 qemu_mutex_unlock(&s->qemu_file_lock);
2879
2880 qemu_file_shutdown(file);
2881 qemu_fclose(file);
2882
2883 /*
2884 * We're already pausing, so ignore any errors on the return
2885 * path and just wait for the thread to finish. It will be
2886 * re-created when we resume.
2887 */
2888 close_return_path_on_source(s);
2889
2890 migrate_set_state(&s->state, s->state,
2891 MIGRATION_STATUS_POSTCOPY_PAUSED);
2892
2893 error_report("Detected IO failure for postcopy. "
2894 "Migration paused.");
2895
2896 /*
2897 * We wait until things fixed up. Then someone will setup the
2898 * status back for us.
2899 */
2900 while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2901 qemu_sem_wait(&s->postcopy_pause_sem);
2902 }
2903
2904 if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2905 /* Woken up by a recover procedure. Give it a shot */
2906
2907 /* Do the resume logic */
2908 if (postcopy_do_resume(s) == 0) {
2909 /* Let's continue! */
2910 trace_postcopy_pause_continued();
2911 return MIG_THR_ERR_RECOVERED;
2912 } else {
2913 /*
2914 * Something wrong happened during the recovery, let's
2915 * pause again. Pause is always better than throwing
2916 * data away.
2917 */
2918 continue;
2919 }
2920 } else {
2921 /* This is not right... Time to quit. */
2922 return MIG_THR_ERR_FATAL;
2923 }
2924 }
2925 }
2926
migration_detect_error(MigrationState * s)2927 static MigThrError migration_detect_error(MigrationState *s)
2928 {
2929 int ret;
2930 int state = s->state;
2931 Error *local_error = NULL;
2932
2933 if (state == MIGRATION_STATUS_CANCELLING ||
2934 state == MIGRATION_STATUS_CANCELLED) {
2935 /* End the migration, but don't set the state to failed */
2936 return MIG_THR_ERR_FATAL;
2937 }
2938
2939 /*
2940 * Try to detect any file errors. Note that postcopy_qemufile_src will
2941 * be NULL when postcopy preempt is not enabled.
2942 */
2943 ret = qemu_file_get_error_obj_any(s->to_dst_file,
2944 s->postcopy_qemufile_src,
2945 &local_error);
2946 if (!ret) {
2947 /* Everything is fine */
2948 assert(!local_error);
2949 return MIG_THR_ERR_NONE;
2950 }
2951
2952 if (local_error) {
2953 migrate_set_error(s, local_error);
2954 error_free(local_error);
2955 }
2956
2957 if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
2958 /*
2959 * For postcopy, we allow the network to be down for a
2960 * while. After that, it can be continued by a
2961 * recovery phase.
2962 */
2963 return postcopy_pause(s);
2964 } else {
2965 /*
2966 * For precopy (or postcopy with error outside IO), we fail
2967 * with no time.
2968 */
2969 migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
2970 trace_migration_thread_file_err();
2971
2972 /* Time to stop the migration, now. */
2973 return MIG_THR_ERR_FATAL;
2974 }
2975 }
2976
/*
 * Record the final timing figures of a migration: ends the downtime
 * measurement, stores the total time, and derives s->mbps from the bytes
 * transferred over the time spent after setup.  s->mbps is left untouched
 * when that time is zero.
 */
static void migration_calculate_complete(MigrationState *s)
{
    uint64_t total_bytes = migration_transferred_bytes();
    int64_t now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    int64_t xfer_ms;

    migration_downtime_end(s);
    s->total_time = now_ms - s->start_time;

    xfer_ms = s->total_time - s->setup_time;
    if (xfer_ms == 0) {
        /* Avoid dividing by zero; keep the previous mbps value */
        return;
    }

    s->mbps = ((double) total_bytes * 8.0) / xfer_ms / 1000;
}
2990
/*
 * Take a fresh baseline (time, transferred bytes, transferred pages) for
 * the next iteration's rate calculation.
 */
static void update_iteration_initial_status(MigrationState *s)
{
    /*
     * Sample the three values back to back, in this order, so that they
     * describe the same moment; mismatched values would lead to a wrong
     * speed calculation.
     */
    int64_t now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    s->iteration_start_time = now_ms;
    s->iteration_initial_bytes = migration_transferred_bytes();
    s->iteration_initial_pages = ram_get_total_transferred_pages();
}
3001
/*
 * Refresh the per-iteration statistics (bandwidth, threshold size, mbps,
 * pages per second, expected downtime) and start a new measuring interval.
 *
 * Does nothing until at least BUFFER_DELAY has elapsed since the current
 * interval started.
 *
 * @s: current migration state
 * @current_time: current time, in the same unit as s->iteration_start_time
 */
static void migration_update_counters(MigrationState *s,
                                      int64_t current_time)
{
    uint64_t sent_bytes, sent_pages, elapsed;
    uint64_t total_bytes; /* bytes transferred since the beginning */
    uint64_t switchover_bw;
    /* Expected bandwidth when switching over to destination QEMU */
    double expected_bw_per_ms;
    double measured_bw;
    double elapsed_sec;

    if (current_time < s->iteration_start_time + BUFFER_DELAY) {
        return;
    }

    switchover_bw = migrate_avail_switchover_bandwidth();
    total_bytes = migration_transferred_bytes();
    sent_bytes = total_bytes - s->iteration_initial_bytes;
    elapsed = current_time - s->iteration_start_time;
    measured_bw = (double)sent_bytes / elapsed;

    if (!switchover_bw) {
        /* If the user doesn't specify bandwidth, we use the estimated */
        expected_bw_per_ms = measured_bw;
    } else {
        /*
         * If the user specified a switchover bandwidth, let's trust the
         * user so that can be more accurate than what we estimated.
         * NOTE(review): this is an integer division, so any remainder
         * below 1000 is dropped before the conversion to double.
         */
        expected_bw_per_ms = switchover_bw / 1000;
    }

    s->threshold_size = expected_bw_per_ms * migrate_downtime_limit();

    elapsed_sec = (double) elapsed / 1000.0;
    s->mbps = (((double) sent_bytes * 8.0) / elapsed_sec) / 1000.0 / 1000.0;

    sent_pages = ram_get_total_transferred_pages() -
                 s->iteration_initial_pages;
    s->pages_per_second = (double) sent_pages / elapsed_sec;

    /*
     * if we haven't sent anything, we don't want to
     * recalculate. 10000 is a small enough number for our purposes
     */
    if (stat64_get(&mig_stats.dirty_pages_rate) &&
        sent_bytes > 10000) {
        s->expected_downtime =
            stat64_get(&mig_stats.dirty_bytes_last_sync) / expected_bw_per_ms;
    }

    migration_rate_reset();

    update_iteration_initial_status(s);

    trace_migrate_transferred(sent_bytes, elapsed,
                              /* Both in unit bytes/ms */
                              measured_bw, switchover_bw / 1000,
                              s->threshold_size);
}
3062
/*
 * Tell whether the source is allowed to switch over to the destination.
 *
 * Switchover is permitted when the switchover-ack feature is off, when
 * the VM is not running (there is no reason to wait for an ACK then),
 * or once the ACK has been recorded in s->switchover_acked.
 */
static bool migration_can_switchover(MigrationState *s)
{
    return !migrate_switchover_ack() ||
           !runstate_is_running() ||
           s->switchover_acked;
}
3076
/*
 * Migration thread iteration status.
 *
 * Returned by the per-iteration helpers to tell the migration thread's
 * main loop what to do after one iteration step.
 */
typedef enum {
    MIG_ITERATE_RESUME,         /* Resume current iteration */
    MIG_ITERATE_SKIP,           /* Skip current iteration */
    MIG_ITERATE_BREAK,          /* Break the loop */
} MigIterateState;
3083
3084 /*
3085 * Return true if continue to the next iteration directly, false
3086 * otherwise.
3087 */
migration_iteration_run(MigrationState * s)3088 static MigIterateState migration_iteration_run(MigrationState *s)
3089 {
3090 uint64_t must_precopy, can_postcopy;
3091 Error *local_err = NULL;
3092 bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
3093 bool can_switchover = migration_can_switchover(s);
3094
3095 qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
3096 uint64_t pending_size = must_precopy + can_postcopy;
3097
3098 trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);
3099
3100 if (must_precopy <= s->threshold_size) {
3101 qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
3102 pending_size = must_precopy + can_postcopy;
3103 trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
3104 }
3105
3106 if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
3107 trace_migration_thread_low_pending(pending_size);
3108 migration_completion(s);
3109 return MIG_ITERATE_BREAK;
3110 }
3111
3112 /* Still a significant amount to transfer */
3113 if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover &&
3114 qatomic_read(&s->start_postcopy)) {
3115 if (postcopy_start(s, &local_err)) {
3116 migrate_set_error(s, local_err);
3117 error_report_err(local_err);
3118 }
3119 return MIG_ITERATE_SKIP;
3120 }
3121
3122 /* Just another iteration step */
3123 qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
3124 return MIG_ITERATE_RESUME;
3125 }
3126
/*
 * Wind down the live migration thread once its main loop has ended.
 *
 * Stops CPU throttling and then, with the iothread lock held, acts on
 * the final migration state: records statistics on success, starts the
 * COLO process, or brings the VM back to the run state it had before
 * migration.  Finally schedules the migration cleanup.
 */
static void migration_iteration_finish(MigrationState *s)
{
    /* If we enabled cpu throttling for auto-converge, turn it off. */
    cpu_throttle_stop();

    qemu_mutex_lock_iothread();

    switch (s->state) {
    case MIGRATION_STATUS_COMPLETED:
        migration_calculate_complete(s);
        runstate_set(RUN_STATE_POSTMIGRATE);
        break;

    case MIGRATION_STATUS_COLO:
        assert(migrate_colo());
        migrate_start_colo_process(s);
        s->vm_old_state = RUN_STATE_RUNNING;
        /* Fallthrough */
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_CANCELLING:
        if (s->vm_old_state != RUN_STATE_RUNNING) {
            /* VM was not running before: just restore the old runstate */
            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
                runstate_set(s->vm_old_state);
            }
        } else if (!runstate_check(RUN_STATE_SHUTDOWN)) {
            /* VM was running before and is not shut down: resume it */
            vm_start();
        }
        break;

    default:
        /* Should not reach here, but if so, forgive the VM. */
        error_report("%s: Unknown ending state %d", __func__, s->state);
        break;
    }

    migrate_fd_cleanup_schedule(s);
    qemu_mutex_unlock_iothread();
}
3165
/*
 * Wind down the background snapshot thread.
 *
 * Stops RAM write tracking and then, with the iothread lock held,
 * records the final statistics if the migration completed, reports an
 * unexpected final state, and schedules the migration cleanup.
 */
static void bg_migration_iteration_finish(MigrationState *s)
{
    /*
     * Write tracking is no longer needed: un-protect memory, un-register
     * the UFFD memory ranges, flush kernel wait queues and wake up any
     * thread still waiting for a write fault to be resolved.
     */
    ram_write_tracking_stop();

    qemu_mutex_lock_iothread();

    switch (s->state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_CANCELLING:
        /* Expected end states with nothing more to do */
        break;

    case MIGRATION_STATUS_COMPLETED:
        migration_calculate_complete(s);
        break;

    default:
        /* Should not reach here, but if so, forgive the VM. */
        error_report("%s: Unknown ending state %d", __func__, s->state);
        break;
    }

    migrate_fd_cleanup_schedule(s);
    qemu_mutex_unlock_iothread();
}
3196
3197 /*
3198 * Return true if continue to the next iteration directly, false
3199 * otherwise.
3200 */
bg_migration_iteration_run(MigrationState * s)3201 static MigIterateState bg_migration_iteration_run(MigrationState *s)
3202 {
3203 int res;
3204
3205 res = qemu_savevm_state_iterate(s->to_dst_file, false);
3206 if (res > 0) {
3207 bg_migration_completion(s);
3208 return MIG_ITERATE_BREAK;
3209 }
3210
3211 return MIG_ITERATE_RESUME;
3212 }
3213
migration_make_urgent_request(void)3214 void migration_make_urgent_request(void)
3215 {
3216 qemu_sem_post(&migrate_get_current()->rate_limit_sem);
3217 }
3218
migration_consume_urgent_request(void)3219 void migration_consume_urgent_request(void)
3220 {
3221 qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
3222 }
3223
3224 /* Returns true if the rate limiting was broken by an urgent request */
migration_rate_limit(void)3225 bool migration_rate_limit(void)
3226 {
3227 int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3228 MigrationState *s = migrate_get_current();
3229
3230 bool urgent = false;
3231 migration_update_counters(s, now);
3232 if (migration_rate_exceeded(s->to_dst_file)) {
3233
3234 if (qemu_file_get_error(s->to_dst_file)) {
3235 return false;
3236 }
3237 /*
3238 * Wait for a delay to do rate limiting OR
3239 * something urgent to post the semaphore.
3240 */
3241 int ms = s->iteration_start_time + BUFFER_DELAY - now;
3242 trace_migration_rate_limit_pre(ms);
3243 if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
3244 /*
3245 * We were woken by one or more urgent things but
3246 * the timedwait will have consumed one of them.
3247 * The service routine for the urgent wake will dec
3248 * the semaphore itself for each item it consumes,
3249 * so add this one we just eat back.
3250 */
3251 qemu_sem_post(&s->rate_limit_sem);
3252 urgent = true;
3253 }
3254 trace_migration_rate_limit_post(urgent);
3255 }
3256 return urgent;
3257 }
3258
3259 /*
3260 * if failover devices are present, wait they are completely
3261 * unplugged
3262 */
3263
qemu_savevm_wait_unplug(MigrationState * s,int old_state,int new_state)3264 static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
3265 int new_state)
3266 {
3267 if (qemu_savevm_state_guest_unplug_pending()) {
3268 migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
3269
3270 while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
3271 qemu_savevm_state_guest_unplug_pending()) {
3272 qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3273 }
3274 if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
3275 int timeout = 120; /* 30 seconds */
3276 /*
3277 * migration has been canceled
3278 * but as we have started an unplug we must wait the end
3279 * to be able to plug back the card
3280 */
3281 while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
3282 qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3283 }
3284 if (qemu_savevm_state_guest_unplug_pending() &&
3285 !qtest_enabled()) {
3286 warn_report("migration: partially unplugged device on "
3287 "failure");
3288 }
3289 }
3290
3291 migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
3292 } else {
3293 migrate_set_state(&s->state, old_state, new_state);
3294 }
3295 }
3296
/*
 * Master migration thread on the source VM.
 * It drives the migration and pumps the data down the outgoing channel.
 *
 * @opaque: the MigrationState of the outgoing migration
 *
 * The thread holds its own reference on the MigrationState from start
 * to finish and registers itself with RCU and with the migration thread
 * list for that duration.  Always returns NULL.
 */
static void *migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    MigrationThread *thread = NULL;
    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    MigThrError thr_error;
    /* Set when the last rate-limit wait was cut short by an urgent request */
    bool urgent = false;

    thread = migration_threads_add("live_migration", qemu_get_thread_id());

    rcu_register_thread();

    object_ref(OBJECT(s));
    update_iteration_initial_status(s);

    /* The stream header is written with the iothread lock held */
    qemu_mutex_lock_iothread();
    qemu_savevm_state_header(s->to_dst_file);
    qemu_mutex_unlock_iothread();

    /*
     * If we opened the return path, we need to make sure dst has it
     * opened as well.
     */
    if (s->rp_state.rp_thread_created) {
        /* Now tell the dest that it should open its end so it can reply */
        qemu_savevm_send_open_return_path(s->to_dst_file);

        /* And do a ping that will make stuff easier to debug */
        qemu_savevm_send_ping(s->to_dst_file, 1);
    }

    if (migrate_postcopy()) {
        /*
         * Tell the destination that we *might* want to do postcopy later;
         * if the other end can't do postcopy it should fail now, nice and
         * early.
         */
        qemu_savevm_send_postcopy_advise(s->to_dst_file);
    }

    if (migrate_colo()) {
        /* Notify migration destination that we enable COLO */
        qemu_savevm_send_colo_enable(s->to_dst_file);
    }

    /* The setup phase also runs with the iothread lock held */
    qemu_mutex_lock_iothread();
    qemu_savevm_state_setup(s->to_dst_file);
    qemu_mutex_unlock_iothread();

    /* SETUP -> ACTIVE, waiting for any pending failover device unplug */
    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
                               MIGRATION_STATUS_ACTIVE);

    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;

    trace_migration_thread_setup_complete();

    while (migration_is_active(s)) {
        /* An urgent request bypasses the rate limit for this iteration */
        if (urgent || !migration_rate_exceeded(s->to_dst_file)) {
            MigIterateState iter_state = migration_iteration_run(s);
            if (iter_state == MIG_ITERATE_SKIP) {
                continue;
            } else if (iter_state == MIG_ITERATE_BREAK) {
                break;
            }
        }

        /*
         * Try to detect any kind of failures, and see whether we
         * should stop the migration now.
         */
        thr_error = migration_detect_error(s);
        if (thr_error == MIG_THR_ERR_FATAL) {
            /* Stop migration */
            break;
        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
            /*
             * Just recovered from a e.g. network failure, reset all
             * the local variables. This is important to avoid
             * breaking transferred_bytes and bandwidth calculation
             */
            update_iteration_initial_status(s);
        }

        urgent = migration_rate_limit();
    }

    trace_migration_thread_after_loop();
    migration_iteration_finish(s);
    /* Undo the registrations and the reference taken at thread start */
    object_unref(OBJECT(s));
    rcu_unregister_thread();
    migration_threads_remove(thread);
    return NULL;
}
3394
bg_migration_vm_start_bh(void * opaque)3395 static void bg_migration_vm_start_bh(void *opaque)
3396 {
3397 MigrationState *s = opaque;
3398
3399 qemu_bh_delete(s->vm_start_bh);
3400 s->vm_start_bh = NULL;
3401
3402 vm_start();
3403 migration_downtime_end(s);
3404 }
3405
/**
 * Background snapshot thread, based on live migration code.
 * This is an alternative implementation of the live migration mechanism,
 * introduced specifically to support background snapshots.
 *
 * It takes advantage of the userfault_fd write protection mechanism
 * introduced in the v5.7 kernel. Compared to the existing dirty page
 * logging migration, much less stream traffic is produced, resulting in
 * smaller snapshot images, simply because no page duplicates can get
 * into the stream.
 *
 * Another key point is that the generated vmstate stream reflects the
 * machine state 'frozen' at the beginning of snapshot creation, whereas
 * with the dirty page logging mechanism the saved snapshot is effectively
 * the state of the VM at the end of the process.
 *
 * @opaque: the MigrationState of the outgoing migration
 *
 * The thread holds its own reference on the MigrationState and is
 * registered with RCU for its whole lifetime.  Always returns NULL.
 */
static void *bg_migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    int64_t setup_start;
    MigThrError thr_error;
    QEMUFile *fb;
    /*
     * True until RAM write tracking has been started.  A jump to 'fail'
     * while this is still set happens with the iothread lock held, and
     * the failure path uses the flag to mark the migration failed and
     * drop that lock.
     */
    bool early_fail = true;

    rcu_register_thread();
    object_ref(OBJECT(s));

    migration_rate_set(RATE_LIMIT_DISABLED);

    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    /*
     * We want to save vmstate for the moment when migration has been
     * initiated but also we want to save RAM content while VM is running.
     * The RAM content should appear first in the vmstate. So, we first
     * stash the non-RAM part of the vmstate to the temporary buffer,
     * then write RAM part of the vmstate to the migration stream
     * with vCPUs running and, finally, write stashed non-RAM part of
     * the vmstate from the buffer to the migration stream.
     */
    s->bioc = qio_channel_buffer_new(512 * 1024);
    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
    fb = qemu_file_new_output(QIO_CHANNEL(s->bioc));
    object_unref(OBJECT(s->bioc));

    update_iteration_initial_status(s);

    /*
     * Prepare for tracking memory writes with UFFD-WP - populate
     * RAM pages before protecting.
     */
#ifdef __linux__
    ram_write_tracking_prepare();
#endif

    /* Stream header and setup are written with the iothread lock held */
    qemu_mutex_lock_iothread();
    qemu_savevm_state_header(s->to_dst_file);
    qemu_savevm_state_setup(s->to_dst_file);
    qemu_mutex_unlock_iothread();

    /* SETUP -> ACTIVE, waiting for any pending failover device unplug */
    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
                               MIGRATION_STATUS_ACTIVE);

    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;

    trace_migration_thread_setup_complete();
    migration_downtime_start(s);

    /* Held until the VM start BH is scheduled, or released at 'fail' */
    qemu_mutex_lock_iothread();

    /*
     * If VM is currently in suspended state, then, to make a valid runstate
     * transition in vm_stop_force_state() we need to wakeup it up.
     */
    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
    s->vm_old_state = runstate_get();

    global_state_store();
    /* Forcibly stop VM before saving state of vCPUs and devices */
    if (migration_stop_vm(RUN_STATE_PAUSED)) {
        goto fail;
    }
    /*
     * Put vCPUs in sync with shadow context structures, then
     * save their state to channel-buffer along with devices.
     */
    cpu_synchronize_all_states();
    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
        goto fail;
    }
    /*
     * Since we are going to get non-iterable state data directly
     * from s->bioc->data, explicit flush is needed here.
     */
    qemu_fflush(fb);

    /* Now initialize UFFD context and start tracking RAM writes */
    if (ram_write_tracking_start()) {
        goto fail;
    }
    /* From here on the iothread lock is released on the normal path */
    early_fail = false;

    /*
     * Start VM from BH handler to avoid write-fault lock here.
     * UFFD-WP protection for the whole RAM is already enabled so
     * calling VM state change notifiers from vm_start() would initiate
     * writes to virtio VQs memory which is in write-protected region.
     */
    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
    qemu_bh_schedule(s->vm_start_bh);

    qemu_mutex_unlock_iothread();

    while (migration_is_active(s)) {
        MigIterateState iter_state = bg_migration_iteration_run(s);
        if (iter_state == MIG_ITERATE_SKIP) {
            continue;
        } else if (iter_state == MIG_ITERATE_BREAK) {
            break;
        }

        /*
         * Try to detect any kind of failures, and see whether we
         * should stop the migration now.
         */
        thr_error = migration_detect_error(s);
        if (thr_error == MIG_THR_ERR_FATAL) {
            /* Stop migration */
            break;
        }

        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
    }

    trace_migration_thread_after_loop();

fail:
    if (early_fail) {
        /* Reached via goto with the iothread lock still held */
        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                MIGRATION_STATUS_FAILED);
        qemu_mutex_unlock_iothread();
    }

    bg_migration_iteration_finish(s);

    /* Release the temporary vmstate buffer file and our own references */
    qemu_fclose(fb);
    object_unref(OBJECT(s));
    rcu_unregister_thread();

    return NULL;
}
3555
migrate_fd_connect(MigrationState * s,Error * error_in)3556 void migrate_fd_connect(MigrationState *s, Error *error_in)
3557 {
3558 Error *local_err = NULL;
3559 uint64_t rate_limit;
3560 bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
3561
3562 /*
3563 * If there's a previous error, free it and prepare for another one.
3564 * Meanwhile if migration completes successfully, there won't have an error
3565 * dumped when calling migrate_fd_cleanup().
3566 */
3567 migrate_error_free(s);
3568
3569 s->expected_downtime = migrate_downtime_limit();
3570 if (resume) {
3571 assert(s->cleanup_bh);
3572 } else {
3573 assert(!s->cleanup_bh);
3574 s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
3575 }
3576 if (error_in) {
3577 migrate_fd_error(s, error_in);
3578 if (resume) {
3579 /*
3580 * Don't do cleanup for resume if channel is invalid, but only dump
3581 * the error. We wait for another channel connect from the user.
3582 * The error_report still gives HMP user a hint on what failed.
3583 * It's normally done in migrate_fd_cleanup(), but call it here
3584 * explicitly.
3585 */
3586 error_report_err(error_copy(s->error));
3587 } else {
3588 migrate_fd_cleanup(s);
3589 }
3590 return;
3591 }
3592
3593 if (resume) {
3594 /* This is a resumed migration */
3595 rate_limit = migrate_max_postcopy_bandwidth();
3596 } else {
3597 /* This is a fresh new migration */
3598 rate_limit = migrate_max_bandwidth();
3599
3600 /* Notify before starting migration thread */
3601 migration_call_notifiers(s);
3602 }
3603
3604 migration_rate_set(rate_limit);
3605 qemu_file_set_blocking(s->to_dst_file, true);
3606
3607 /*
3608 * Open the return path. For postcopy, it is used exclusively. For
3609 * precopy, only if user specified "return-path" capability would
3610 * QEMU uses the return path.
3611 */
3612 if (migrate_postcopy_ram() || migrate_return_path()) {
3613 if (open_return_path_on_source(s)) {
3614 error_setg(&local_err, "Unable to open return-path for postcopy");
3615 migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
3616 migrate_set_error(s, local_err);
3617 error_report_err(local_err);
3618 migrate_fd_cleanup(s);
3619 return;
3620 }
3621 }
3622
3623 /*
3624 * This needs to be done before resuming a postcopy. Note: for newer
3625 * QEMUs we will delay the channel creation until postcopy_start(), to
3626 * avoid disorder of channel creations.
3627 */
3628 if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
3629 postcopy_preempt_setup(s);
3630 }
3631
3632 if (resume) {
3633 /* Wakeup the main migration thread to do the recovery */
3634 migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
3635 MIGRATION_STATUS_POSTCOPY_RECOVER);
3636 qemu_sem_post(&s->postcopy_pause_sem);
3637 return;
3638 }
3639
3640 if (multifd_save_setup(&local_err) != 0) {
3641 migrate_set_error(s, local_err);
3642 error_report_err(local_err);
3643 migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
3644 MIGRATION_STATUS_FAILED);
3645 migrate_fd_cleanup(s);
3646 return;
3647 }
3648
3649 if (migrate_background_snapshot()) {
3650 qemu_thread_create(&s->thread, "bg_snapshot",
3651 bg_migration_thread, s, QEMU_THREAD_JOINABLE);
3652 } else {
3653 qemu_thread_create(&s->thread, "live_migration",
3654 migration_thread, s, QEMU_THREAD_JOINABLE);
3655 }
3656 s->migration_thread_running = true;
3657 }
3658
/*
 * Class initializer for TYPE_MIGRATION.
 *
 * Marks the class as not user-creatable and installs the migration
 * properties on it.  @data is unused.
 */
static void migration_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *device_class = DEVICE_CLASS(klass);

    device_class->user_creatable = false;
    device_class_set_props(device_class, migration_properties);
}
3666
/*
 * Instance finalizer for TYPE_MIGRATION.
 *
 * Destroys every mutex and semaphore set up by
 * migration_instance_init() and frees any error still recorded in the
 * migration state.
 */
static void migration_instance_finalize(Object *obj)
{
    MigrationState *s = MIGRATION_OBJ(obj);

    /* Mutexes */
    qemu_mutex_destroy(&s->qemu_file_lock);
    qemu_mutex_destroy(&s->error_mutex);

    /* Semaphores */
    qemu_sem_destroy(&s->postcopy_qemufile_src_sem);
    qemu_sem_destroy(&s->wait_unplug_sem);
    qemu_sem_destroy(&s->rate_limit_sem);
    qemu_sem_destroy(&s->rp_state.rp_pong_acks);
    qemu_sem_destroy(&s->rp_state.rp_sem);
    qemu_sem_destroy(&s->postcopy_pause_sem);
    qemu_sem_destroy(&s->pause_sem);

    error_free(s->error);
}
3682
/*
 * Instance initializer for TYPE_MIGRATION.
 *
 * Puts the migration in the NONE state, marks the throughput figures
 * as not yet available (-1), initializes the migration parameters and
 * sets up the mutexes and semaphores (all semaphores start at 0).
 */
static void migration_instance_init(Object *obj)
{
    MigrationState *s = MIGRATION_OBJ(obj);

    s->state = MIGRATION_STATUS_NONE;
    s->mbps = -1;
    s->pages_per_second = -1;

    /* Mutexes */
    qemu_mutex_init(&s->error_mutex);
    qemu_mutex_init(&s->qemu_file_lock);

    migrate_params_init(&s->parameters);

    /* Semaphores */
    qemu_sem_init(&s->pause_sem, 0);
    qemu_sem_init(&s->postcopy_pause_sem, 0);
    qemu_sem_init(&s->rp_state.rp_sem, 0);
    qemu_sem_init(&s->rp_state.rp_pong_acks, 0);
    qemu_sem_init(&s->rate_limit_sem, 0);
    qemu_sem_init(&s->wait_unplug_sem, 0);
    qemu_sem_init(&s->postcopy_qemufile_src_sem, 0);
}
3703
3704 /*
3705 * Return true if check pass, false otherwise. Error will be put
3706 * inside errp if provided.
3707 */
migration_object_check(MigrationState * ms,Error ** errp)3708 static bool migration_object_check(MigrationState *ms, Error **errp)
3709 {
3710 /* Assuming all off */
3711 bool old_caps[MIGRATION_CAPABILITY__MAX] = { 0 };
3712
3713 if (!migrate_params_check(&ms->parameters, errp)) {
3714 return false;
3715 }
3716
3717 return migrate_caps_check(old_caps, ms->capabilities, errp);
3718 }
3719
/*
 * QOM type description for the migration object: ties TYPE_MIGRATION
 * to its class/instance sizes and to the init/finalize hooks.
 */
static const TypeInfo migration_type = {
    .name = TYPE_MIGRATION,
    /*
     * NOTE: TYPE_MIGRATION is not really a device, as the object is
     * not created using qdev_new(), it is not attached to the qdev
     * device tree, and it is never realized.
     *
     * TODO: Make this TYPE_OBJECT once QOM provides something like
     * TYPE_DEVICE's "-global" properties.
     */
    .parent = TYPE_DEVICE,
    .class_init = migration_class_init,
    .class_size = sizeof(MigrationClass),
    .instance_size = sizeof(MigrationState),
    .instance_init = migration_instance_init,
    .instance_finalize = migration_instance_finalize,
};
3737
/* Register the migration object type with the QOM type system. */
static void register_migration_types(void)
{
    type_register_static(&migration_type);
}

/* Hook the registration function in through the type_init() macro */
type_init(register_migration_types);
3744