xref: /openbmc/qemu/migration/migration.c (revision 9b42e33b)
1 /*
2  * QEMU live migration
3  *
4  * Copyright IBM, Corp. 2008
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15 
16 #include "qemu/osdep.h"
17 #include "qemu/cutils.h"
18 #include "qemu/error-report.h"
19 #include "qemu/main-loop.h"
20 #include "migration/blocker.h"
21 #include "exec.h"
22 #include "fd.h"
23 #include "file.h"
24 #include "socket.h"
25 #include "sysemu/runstate.h"
26 #include "sysemu/sysemu.h"
27 #include "sysemu/cpu-throttle.h"
28 #include "rdma.h"
29 #include "ram.h"
30 #include "migration/global_state.h"
31 #include "migration/misc.h"
32 #include "migration.h"
33 #include "migration-stats.h"
34 #include "savevm.h"
35 #include "qemu-file.h"
36 #include "channel.h"
37 #include "migration/vmstate.h"
38 #include "block/block.h"
39 #include "qapi/error.h"
40 #include "qapi/clone-visitor.h"
41 #include "qapi/qapi-visit-migration.h"
42 #include "qapi/qapi-visit-sockets.h"
43 #include "qapi/qapi-commands-migration.h"
44 #include "qapi/qapi-events-migration.h"
45 #include "qapi/qmp/qerror.h"
46 #include "qapi/qmp/qnull.h"
47 #include "qemu/rcu.h"
48 #include "postcopy-ram.h"
49 #include "qemu/thread.h"
50 #include "trace.h"
51 #include "exec/target_page.h"
52 #include "io/channel-buffer.h"
53 #include "io/channel-tls.h"
54 #include "migration/colo.h"
55 #include "hw/boards.h"
56 #include "monitor/monitor.h"
57 #include "net/announce.h"
58 #include "qemu/queue.h"
59 #include "multifd.h"
60 #include "threadinfo.h"
61 #include "qemu/yank.h"
62 #include "sysemu/cpus.h"
63 #include "yank_functions.h"
64 #include "sysemu/qtest.h"
65 #include "options.h"
66 #include "sysemu/dirtylimit.h"
67 #include "qemu/sockets.h"
68 #include "sysemu/kvm.h"
69 
70 #define NOTIFIER_ELEM_INIT(array, elem)    \
71     [elem] = NOTIFIER_WITH_RETURN_LIST_INITIALIZER((array)[elem])
72 
73 #define INMIGRATE_DEFAULT_EXIT_ON_ERROR true
74 
75 static NotifierWithReturnList migration_state_notifiers[] = {
76     NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_NORMAL),
77     NOTIFIER_ELEM_INIT(migration_state_notifiers, MIG_MODE_CPR_REBOOT),
78 };
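/*
 * Illustrative note (not in the original source): for one element, the
 * NOTIFIER_ELEM_INIT() designated initializer above expands to roughly:
 *
 *     [MIG_MODE_NORMAL] = NOTIFIER_WITH_RETURN_LIST_INITIALIZER(
 *                             migration_state_notifiers[MIG_MODE_NORMAL])
 *
 * i.e. each migration mode gets its own statically initialized notifier
 * list, indexed by MigMode.
 */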
79 
80 /* Messages sent on the return path from destination to source */
81 enum mig_rp_message_type {
82     MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
83     MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
84     MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32) */
85 
86     MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
87     MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
88     MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
89     MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
90     MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */
91 
92     MIG_RP_MSG_MAX
93 };
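/*
 * Illustrative note (not in the original source): every return-path
 * message shares the same framing, as implemented by
 * migrate_send_rp_message() below:
 *
 *     +-------------+-------------+-------------------+
 *     | type (be16) | len (be16)  | payload (len B)   |
 *     +-------------+-------------+-------------------+
 *
 * The per-message "data" comments in the enum above describe only the
 * payload part of this frame.
 */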
94 
95 /* When we add fault tolerance, we could have several
96    migrations at once.  For now we don't need dynamic
97    creation of migration state. */
98 
99 static MigrationState *current_migration;
100 static MigrationIncomingState *current_incoming;
101 
102 static GSList *migration_blockers[MIG_MODE__MAX];
103 
104 static bool migration_object_check(MigrationState *ms, Error **errp);
105 static int migration_maybe_pause(MigrationState *s,
106                                  int *current_active_state,
107                                  int new_state);
108 static void migrate_fd_cancel(MigrationState *s);
109 static bool close_return_path_on_source(MigrationState *s);
110 static void migration_completion_end(MigrationState *s);
111 
112 static void migration_downtime_start(MigrationState *s)
113 {
114     trace_vmstate_downtime_checkpoint("src-downtime-start");
115     s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
116 }
117 
118 static void migration_downtime_end(MigrationState *s)
119 {
120     int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
121 
122     /*
123      * If downtime is already set, postcopy must have set it, in which
124      * case it is already the real downtime.
125      */
126     if (!s->downtime) {
127         s->downtime = now - s->downtime_start;
128     }
129 
130     trace_vmstate_downtime_checkpoint("src-downtime-end");
131 }
132 
133 static bool migration_needs_multiple_sockets(void)
134 {
135     return migrate_multifd() || migrate_postcopy_preempt();
136 }
137 
138 static bool transport_supports_multi_channels(MigrationAddress *addr)
139 {
140     if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
141         SocketAddress *saddr = &addr->u.socket;
142 
143         return (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
144                 saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
145                 saddr->type == SOCKET_ADDRESS_TYPE_VSOCK);
146     } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
147         return migrate_mapped_ram();
148     } else {
149         return false;
150     }
151 }
152 
153 static bool migration_needs_seekable_channel(void)
154 {
155     return migrate_mapped_ram();
156 }
157 
158 static bool migration_needs_extra_fds(void)
159 {
160     /*
161      * When doing direct-io, multifd requires two different,
162      * non-duplicated file descriptors so we can use one of them for
163      * unaligned IO.
164      */
165     return migrate_multifd() && migrate_direct_io();
166 }
167 
168 static bool transport_supports_seeking(MigrationAddress *addr)
169 {
170     if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
171         return true;
172     }
173 
174     return false;
175 }
176 
177 static bool transport_supports_extra_fds(MigrationAddress *addr)
178 {
179     /* file: works because QEMU can open it multiple times */
180     return addr->transport == MIGRATION_ADDRESS_TYPE_FILE;
181 }
182 
183 static bool
184 migration_channels_and_transport_compatible(MigrationAddress *addr,
185                                             Error **errp)
186 {
187     if (migration_needs_seekable_channel() &&
188         !transport_supports_seeking(addr)) {
189         error_setg(errp, "Migration requires seekable transport (e.g. file)");
190         return false;
191     }
192 
193     if (migration_needs_multiple_sockets() &&
194         !transport_supports_multi_channels(addr)) {
195         error_setg(errp, "Migration requires multi-channel URIs (e.g. tcp)");
196         return false;
197     }
198 
199     if (migration_needs_extra_fds() &&
200         !transport_supports_extra_fds(addr)) {
201         error_setg(errp,
202                    "Migration requires a transport that allows for extra fds (e.g. file)");
203         return false;
204     }
205 
206     return true;
207 }
208 
209 static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
210 {
211     uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
212 
213     return (a > b) - (a < b);
214 }
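/*
 * Illustrative note (not in the original source): the branchless
 * expression above yields the usual three-way comparison result, e.g.
 *
 *     a = 0x2000, b = 0x1000  ->  (1) - (0) =  1
 *     a = 0x1000, b = 0x1000  ->  (0) - (0) =  0
 *     a = 0x1000, b = 0x2000  ->  (0) - (1) = -1
 *
 * which is exactly what GTree expects from a GCompareFunc.
 */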
215 
216 static int migration_stop_vm(MigrationState *s, RunState state)
217 {
218     int ret;
219 
220     migration_downtime_start(s);
221 
222     s->vm_old_state = runstate_get();
223     global_state_store();
224 
225     ret = vm_stop_force_state(state);
226 
227     trace_vmstate_downtime_checkpoint("src-vm-stopped");
228     trace_migration_completion_vm_stop(ret);
229 
230     return ret;
231 }
232 
233 void migration_object_init(void)
234 {
235     /* This can only be called once. */
236     assert(!current_migration);
237     current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
238 
239     /*
240      * Initialize the incoming migration object as well, whether or
241      * not we'll use it.
242      */
243     assert(!current_incoming);
244     current_incoming = g_new0(MigrationIncomingState, 1);
245     current_incoming->state = MIGRATION_STATUS_NONE;
246     current_incoming->postcopy_remote_fds =
247         g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
248     qemu_mutex_init(&current_incoming->rp_mutex);
249     qemu_mutex_init(&current_incoming->postcopy_prio_thread_mutex);
250     qemu_event_init(&current_incoming->main_thread_load_event, false);
251     qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
252     qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
253     qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
254     qemu_sem_init(&current_incoming->postcopy_qemufile_dst_done, 0);
255 
256     qemu_mutex_init(&current_incoming->page_request_mutex);
257     qemu_cond_init(&current_incoming->page_request_cond);
258     current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
259 
260     current_incoming->exit_on_error = INMIGRATE_DEFAULT_EXIT_ON_ERROR;
261 
262     migration_object_check(current_migration, &error_fatal);
263 
264     ram_mig_init();
265     dirty_bitmap_mig_init();
266 }
267 
268 typedef struct {
269     QEMUBH *bh;
270     QEMUBHFunc *cb;
271     void *opaque;
272 } MigrationBH;
273 
274 static void migration_bh_dispatch_bh(void *opaque)
275 {
276     MigrationState *s = migrate_get_current();
277     MigrationBH *migbh = opaque;
278 
279     /* cleanup this BH */
280     qemu_bh_delete(migbh->bh);
281     migbh->bh = NULL;
282 
283     /* dispatch the wrapped callback */
284     migbh->cb(migbh->opaque);
285     object_unref(OBJECT(s));
286 
287     g_free(migbh);
288 }
289 
290 void migration_bh_schedule(QEMUBHFunc *cb, void *opaque)
291 {
292     MigrationState *s = migrate_get_current();
293     MigrationBH *migbh = g_new0(MigrationBH, 1);
294     QEMUBH *bh = qemu_bh_new(migration_bh_dispatch_bh, migbh);
295 
296     /* Store these to dispatch when the BH runs */
297     migbh->bh = bh;
298     migbh->cb = cb;
299     migbh->opaque = opaque;
300 
301     /*
302      * Take a reference on the state for the BH, because it may run
303      * when there are no other references left.
304      */
305     object_ref(OBJECT(s));
306     qemu_bh_schedule(bh);
307 }
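/*
 * Illustrative usage sketch (hypothetical callback, not from this file):
 *
 *     static void my_migration_bh(void *opaque)
 *     {
 *         MigrationState *s = opaque;
 *         ... runs later in the main loop ...
 *     }
 *
 *     migration_bh_schedule(my_migration_bh, s);
 *
 * The wrapper holds a reference on the MigrationState until the BH has
 * dispatched, so the callback can run safely even after all other
 * references have been dropped.
 */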
308 
309 void migration_cancel(const Error *error)
310 {
311     if (error) {
312         migrate_set_error(current_migration, error);
313     }
314     if (migrate_dirty_limit()) {
315         qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
316     }
317     migrate_fd_cancel(current_migration);
318 }
319 
320 void migration_shutdown(void)
321 {
322     /*
323      * When the QEMU main thread exits, the COLO thread may be
324      * waiting on a semaphore, so wake it up before shutting
325      * migration down.
326      */
327     colo_shutdown();
328     /*
329      * Cancel the current migration - that will (eventually)
330      * stop the migration using this structure
331      */
332     migration_cancel(NULL);
333     object_unref(OBJECT(current_migration));
334 
335     /*
336      * Cancel outgoing migration of dirty bitmaps. It should
337      * at least unref used block nodes.
338      */
339     dirty_bitmap_mig_cancel_outgoing();
340 
341     /*
342      * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
343      * are non-critical data, and their loss is never considered
344      * serious.
345      */
346     dirty_bitmap_mig_cancel_incoming();
347 }
348 
349 /* For outgoing */
350 MigrationState *migrate_get_current(void)
351 {
352     /* This can only be called after the object is created. */
353     assert(current_migration);
354     return current_migration;
355 }
356 
357 MigrationIncomingState *migration_incoming_get_current(void)
358 {
359     assert(current_incoming);
360     return current_incoming;
361 }
362 
363 void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
364 {
365     if (mis->socket_address_list) {
366         qapi_free_SocketAddressList(mis->socket_address_list);
367         mis->socket_address_list = NULL;
368     }
369 
370     if (mis->transport_cleanup) {
371         mis->transport_cleanup(mis->transport_data);
372         mis->transport_data = mis->transport_cleanup = NULL;
373     }
374 }
375 
376 void migration_incoming_state_destroy(void)
377 {
378     struct MigrationIncomingState *mis = migration_incoming_get_current();
379 
380     multifd_recv_cleanup();
381     /*
382      * RAM state cleanup needs to happen after multifd cleanup, because
383      * multifd threads can use some of its states (receivedmap).
384      */
385     qemu_loadvm_state_cleanup();
386 
387     if (mis->to_src_file) {
388         /* Tell source that we are done */
389         migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
390         qemu_fclose(mis->to_src_file);
391         mis->to_src_file = NULL;
392     }
393 
394     if (mis->from_src_file) {
395         migration_ioc_unregister_yank_from_file(mis->from_src_file);
396         qemu_fclose(mis->from_src_file);
397         mis->from_src_file = NULL;
398     }
399     if (mis->postcopy_remote_fds) {
400         g_array_free(mis->postcopy_remote_fds, TRUE);
401         mis->postcopy_remote_fds = NULL;
402     }
403 
404     migration_incoming_transport_cleanup(mis);
405     qemu_event_reset(&mis->main_thread_load_event);
406 
407     if (mis->page_requested) {
408         g_tree_destroy(mis->page_requested);
409         mis->page_requested = NULL;
410     }
411 
412     if (mis->postcopy_qemufile_dst) {
413         migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
414         qemu_fclose(mis->postcopy_qemufile_dst);
415         mis->postcopy_qemufile_dst = NULL;
416     }
417 
418     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
419 }
420 
421 static void migrate_generate_event(MigrationStatus new_state)
422 {
423     if (migrate_events()) {
424         qapi_event_send_migration(new_state);
425     }
426 }
427 
428 /*
429  * Send a message on the return channel back to the source
430  * of the migration.
431  */
432 static int migrate_send_rp_message(MigrationIncomingState *mis,
433                                    enum mig_rp_message_type message_type,
434                                    uint16_t len, void *data)
435 {
436     int ret = 0;
437 
438     trace_migrate_send_rp_message((int)message_type, len);
439     QEMU_LOCK_GUARD(&mis->rp_mutex);
440 
441     /*
442      * It's possible that the file handle got lost due to network
443      * failures.
444      */
445     if (!mis->to_src_file) {
446         ret = -EIO;
447         return ret;
448     }
449 
450     qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
451     qemu_put_be16(mis->to_src_file, len);
452     qemu_put_buffer(mis->to_src_file, data, len);
453     return qemu_fflush(mis->to_src_file);
454 }
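/*
 * Illustrative example (not in the original source): a PONG carrying the
 * sequence value 42 is framed by the code above as 8 bytes on the wire:
 *
 *     00 02        type  = MIG_RP_MSG_PONG (2), be16
 *     00 04        len   = 4, be16
 *     00 00 00 2a  value = 42, be32 payload
 */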
455 
456 /* Request one page from the source VM at the given start address.
457  *   rb: the RAMBlock to request the page in
458  *   start: address offset within the RB
459  *   len: length in bytes required - must be a multiple of pagesize
460  */
461 int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
462                                       RAMBlock *rb, ram_addr_t start)
463 {
464     uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname len (1) + rbname (up to 255) */
465     size_t msglen = 12; /* start + len */
466     size_t len = qemu_ram_pagesize(rb);
467     enum mig_rp_message_type msg_type;
468     const char *rbname;
469     int rbname_len;
470 
471     *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
472     *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
473 
474     /*
475      * We track the last RAMBlock that we requested a page from.  Note that
476      * we don't need locking because this function will only be called
477      * within the postcopy ram fault thread.
478      */
479     if (rb != mis->last_rb) {
480         mis->last_rb = rb;
481 
482         rbname = qemu_ram_get_idstr(rb);
483         rbname_len = strlen(rbname);
484 
485         assert(rbname_len < 256);
486 
487         bufc[msglen++] = rbname_len;
488         memcpy(bufc + msglen, rbname, rbname_len);
489         msglen += rbname_len;
490         msg_type = MIG_RP_MSG_REQ_PAGES_ID;
491     } else {
492         msg_type = MIG_RP_MSG_REQ_PAGES;
493     }
494 
495     return migrate_send_rp_message(mis, msg_type, msglen, bufc);
496 }
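/*
 * Illustrative note (not in the original source): the bufc layout built
 * above, for the MIG_RP_MSG_REQ_PAGES_ID case:
 *
 *     offset 0..7   start, be64
 *     offset 8..11  len, be32
 *     offset 12     rbname_len (1 byte)
 *     offset 13..   rbname, rbname_len bytes (not NUL-terminated)
 *
 * For MIG_RP_MSG_REQ_PAGES only the first 12 bytes are sent.
 */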
497 
498 int migrate_send_rp_req_pages(MigrationIncomingState *mis,
499                               RAMBlock *rb, ram_addr_t start, uint64_t haddr)
500 {
501     void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
502     bool received = false;
503 
504     WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
505         received = ramblock_recv_bitmap_test_byte_offset(rb, start);
506         if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
507             /*
508              * The page has not been received, and it's not yet in the page
509              * request list.  Queue it.  Set the value of element to 1, so that
510              * things like g_tree_lookup() will return TRUE (1) when found.
511              */
512             g_tree_insert(mis->page_requested, aligned, (gpointer)1);
513             qatomic_inc(&mis->page_requested_count);
514             trace_postcopy_page_req_add(aligned, mis->page_requested_count);
515         }
516     }
517 
518     /*
519      * If the page is there, skip sending the message.  We don't even need
520      * the lock because once the page has arrived, it'll be there forever.
521      */
522     if (received) {
523         return 0;
524     }
525 
526     return migrate_send_rp_message_req_pages(mis, rb, start);
527 }
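/*
 * Illustrative note (not in the original source): the ROUND_DOWN above
 * aligns the faulting host address to the RAMBlock's page size, e.g.
 * with a 4 KiB page size, haddr = 0x7f0012345678 becomes
 * aligned = 0x7f0012345000, so duplicate faults within one page map to
 * a single entry in the page_requested tree.
 */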
528 
529 static bool migration_colo_enabled;
530 bool migration_incoming_colo_enabled(void)
531 {
532     return migration_colo_enabled;
533 }
534 
535 void migration_incoming_disable_colo(void)
536 {
537     ram_block_discard_disable(false);
538     migration_colo_enabled = false;
539 }
540 
541 int migration_incoming_enable_colo(void)
542 {
543 #ifndef CONFIG_REPLICATION
544     error_report("ENABLE_COLO command came in the migration stream, but "
545                  "the replication module is not built in");
546     return -ENOTSUP;
547 #endif
548 
549     if (!migrate_colo()) {
550         error_report("ENABLE_COLO command came in the migration stream, "
551                      "but x-colo capability is not set");
552         return -EINVAL;
553     }
554 
555     if (ram_block_discard_disable(true)) {
556         error_report("COLO: cannot disable RAM discard");
557         return -EBUSY;
558     }
559     migration_colo_enabled = true;
560     return 0;
561 }
562 
563 void migrate_add_address(SocketAddress *address)
564 {
565     MigrationIncomingState *mis = migration_incoming_get_current();
566 
567     QAPI_LIST_PREPEND(mis->socket_address_list,
568                       QAPI_CLONE(SocketAddress, address));
569 }
570 
571 bool migrate_uri_parse(const char *uri, MigrationChannel **channel,
572                        Error **errp)
573 {
574     g_autoptr(MigrationChannel) val = g_new0(MigrationChannel, 1);
575     g_autoptr(MigrationAddress) addr = g_new0(MigrationAddress, 1);
576     InetSocketAddress *isock = &addr->u.rdma;
577     strList **tail = &addr->u.exec.args;
578 
579     if (strstart(uri, "exec:", NULL)) {
580         addr->transport = MIGRATION_ADDRESS_TYPE_EXEC;
581 #ifdef WIN32
582         QAPI_LIST_APPEND(tail, g_strdup(exec_get_cmd_path()));
583         QAPI_LIST_APPEND(tail, g_strdup("/c"));
584 #else
585         QAPI_LIST_APPEND(tail, g_strdup("/bin/sh"));
586         QAPI_LIST_APPEND(tail, g_strdup("-c"));
587 #endif
588         QAPI_LIST_APPEND(tail, g_strdup(uri + strlen("exec:")));
589     } else if (strstart(uri, "rdma:", NULL)) {
590         if (inet_parse(isock, uri + strlen("rdma:"), errp)) {
591             qapi_free_InetSocketAddress(isock);
592             return false;
593         }
594         addr->transport = MIGRATION_ADDRESS_TYPE_RDMA;
595     } else if (strstart(uri, "tcp:", NULL) ||
596                 strstart(uri, "unix:", NULL) ||
597                 strstart(uri, "vsock:", NULL) ||
598                 strstart(uri, "fd:", NULL)) {
599         addr->transport = MIGRATION_ADDRESS_TYPE_SOCKET;
600         SocketAddress *saddr = socket_parse(uri, errp);
601         if (!saddr) {
602             return false;
603         }
604         addr->u.socket.type = saddr->type;
605         addr->u.socket.u = saddr->u;
606         /* Don't free the objects inside; their ownership moved to "addr" */
607         g_free(saddr);
608     } else if (strstart(uri, "file:", NULL)) {
609         addr->transport = MIGRATION_ADDRESS_TYPE_FILE;
610         addr->u.file.filename = g_strdup(uri + strlen("file:"));
611         if (file_parse_offset(addr->u.file.filename, &addr->u.file.offset,
612                               errp)) {
613             return false;
614         }
615     } else {
616         error_setg(errp, "unknown migration protocol: %s", uri);
617         return false;
618     }
619 
620     val->channel_type = MIGRATION_CHANNEL_TYPE_MAIN;
621     val->addr = g_steal_pointer(&addr);
622     *channel = g_steal_pointer(&val);
623     return true;
624 }
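/*
 * Illustrative usage sketch (hypothetical values, not from this file):
 *
 *     MigrationChannel *channel = NULL;
 *     Error *err = NULL;
 *
 *     if (migrate_uri_parse("tcp:127.0.0.1:4444", &channel, &err)) {
 *         // channel->channel_type == MIGRATION_CHANNEL_TYPE_MAIN
 *         // channel->addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET
 *     }
 *
 * On success, ownership of *channel passes to the caller.
 */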
625 
626 static bool
627 migration_incoming_state_setup(MigrationIncomingState *mis, Error **errp)
628 {
629     MigrationStatus current = mis->state;
630 
631     if (current == MIGRATION_STATUS_POSTCOPY_PAUSED) {
632         /*
633          * Incoming postcopy migration will stay in PAUSED state even if
634          * reconnection happened.
635          */
636         return true;
637     }
638 
639     if (current != MIGRATION_STATUS_NONE) {
640         error_setg(errp, "Illegal migration incoming state: %s",
641                    MigrationStatus_str(current));
642         return false;
643     }
644 
645     migrate_set_state(&mis->state, current, MIGRATION_STATUS_SETUP);
646     return true;
647 }
648 
649 static void qemu_start_incoming_migration(const char *uri, bool has_channels,
650                                           MigrationChannelList *channels,
651                                           Error **errp)
652 {
653     g_autoptr(MigrationChannel) channel = NULL;
654     MigrationAddress *addr = NULL;
655     MigrationIncomingState *mis = migration_incoming_get_current();
656 
657     /*
658      * Do preliminary checks on the uri and channels arguments.
659      */
660     if (!uri == !channels) {
661         error_setg(errp, "need either 'uri' or 'channels' argument");
662         return;
663     }
664 
665     if (channels) {
666         /* Verify that the channel list has exactly one entry */
667         if (channels->next) {
668             error_setg(errp, "Channel list has more than one entry");
669             return;
670         }
671         addr = channels->value->addr;
672     }
673 
674     if (uri) {
675         /* caller uses the old URI syntax */
676         if (!migrate_uri_parse(uri, &channel, errp)) {
677             return;
678         }
679         addr = channel->addr;
680     }
681 
682     /* transport mechanism not suitable for migration? */
683     if (!migration_channels_and_transport_compatible(addr, errp)) {
684         return;
685     }
686 
687     if (!migration_incoming_state_setup(mis, errp)) {
688         return;
689     }
690 
691     if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
692         SocketAddress *saddr = &addr->u.socket;
693         if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
694             saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
695             saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
696             socket_start_incoming_migration(saddr, errp);
697         } else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
698             fd_start_incoming_migration(saddr->u.fd.str, errp);
699         }
700 #ifdef CONFIG_RDMA
701     } else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
702         if (migrate_xbzrle()) {
703             error_setg(errp, "RDMA and XBZRLE can't be used together");
704             return;
705         }
706         if (migrate_multifd()) {
707             error_setg(errp, "RDMA and multifd can't be used together");
708             return;
709         }
710         rdma_start_incoming_migration(&addr->u.rdma, errp);
711 #endif
712     } else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
713         exec_start_incoming_migration(addr->u.exec.args, errp);
714     } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
715         file_start_incoming_migration(&addr->u.file, errp);
716     } else {
717         error_setg(errp, "unknown migration protocol: %s", uri);
718     }
719 }
720 
721 static void process_incoming_migration_bh(void *opaque)
722 {
723     Error *local_err = NULL;
724     MigrationIncomingState *mis = opaque;
725 
726     trace_vmstate_downtime_checkpoint("dst-precopy-bh-enter");
727 
728     /* If capability late_block_activate is set:
729      * Only fire up the block code now if we're going to restart the
730      * VM, else 'cont' will do it.
731      * This causes file locking to happen, so we don't want it to
732      * happen unless we really are starting the VM.
733      */
734     if (!migrate_late_block_activate() ||
735          (autostart && (!global_state_received() ||
736             runstate_is_live(global_state_get_runstate())))) {
737         /* Make sure all file formats throw away their mutable metadata.
738          * If we get an error here, just don't restart the VM yet. */
739         bdrv_activate_all(&local_err);
740         if (local_err) {
741             error_report_err(local_err);
742             local_err = NULL;
743             autostart = false;
744         }
745     }
746 
747     /*
748      * This must happen after all error conditions are dealt with and
749      * we're sure the VM is going to be running on this host.
750      */
751     qemu_announce_self(&mis->announce_timer, migrate_announce_params());
752 
753     trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced");
754 
755     multifd_recv_shutdown();
756 
757     dirty_bitmap_mig_before_vm_start();
758 
759     if (!global_state_received() ||
760         runstate_is_live(global_state_get_runstate())) {
761         if (autostart) {
762             vm_start();
763         } else {
764             runstate_set(RUN_STATE_PAUSED);
765         }
766     } else if (migration_incoming_colo_enabled()) {
767         migration_incoming_disable_colo();
768         vm_start();
769     } else {
770         runstate_set(global_state_get_runstate());
771     }
772     trace_vmstate_downtime_checkpoint("dst-precopy-bh-vm-started");
773     /*
774      * This must happen after any state changes since as soon as an external
775      * observer sees this event they might start to prod at the VM assuming
776      * it's ready to use.
777      */
778     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
779                       MIGRATION_STATUS_COMPLETED);
780     migration_incoming_state_destroy();
781 }
782 
783 static void coroutine_fn
784 process_incoming_migration_co(void *opaque)
785 {
786     MigrationState *s = migrate_get_current();
787     MigrationIncomingState *mis = migration_incoming_get_current();
788     PostcopyState ps;
789     int ret;
790     Error *local_err = NULL;
791 
792     assert(mis->from_src_file);
793 
794     mis->largest_page_size = qemu_ram_pagesize_largest();
795     postcopy_state_set(POSTCOPY_INCOMING_NONE);
796     migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
797                       MIGRATION_STATUS_ACTIVE);
798 
799     mis->loadvm_co = qemu_coroutine_self();
800     ret = qemu_loadvm_state(mis->from_src_file);
801     mis->loadvm_co = NULL;
802 
803     trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed");
804 
805     ps = postcopy_state_get();
806     trace_process_incoming_migration_co_end(ret, ps);
807     if (ps != POSTCOPY_INCOMING_NONE) {
808         if (ps == POSTCOPY_INCOMING_ADVISE) {
809             /*
810              * Where a migration had postcopy enabled (and thus went to advise)
811              * but managed to complete within the precopy period, we can use
812              * the normal exit.
813              */
814             postcopy_ram_incoming_cleanup(mis);
815         } else if (ret >= 0) {
816             /*
817              * Postcopy was started, cleanup should happen at the end of the
818              * postcopy thread.
819              */
820             trace_process_incoming_migration_co_postcopy_end_main();
821             return;
822         }
823         /* Else if something went wrong then just fall out of the normal exit */
824     }
825 
826     if (ret < 0) {
827         error_setg(&local_err, "load of migration failed: %s", strerror(-ret));
828         goto fail;
829     }
830 
831     if (migration_incoming_colo_enabled()) {
832         /* yield until COLO exit */
833         colo_incoming_co();
834     }
835 
836     migration_bh_schedule(process_incoming_migration_bh, mis);
837     return;
838 fail:
839     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
840                       MIGRATION_STATUS_FAILED);
841     migrate_set_error(s, local_err);
842     error_free(local_err);
843 
844     migration_incoming_state_destroy();
845 
846     if (mis->exit_on_error) {
847         WITH_QEMU_LOCK_GUARD(&s->error_mutex) {
848             error_report_err(s->error);
849             s->error = NULL;
850         }
851 
852         exit(EXIT_FAILURE);
853     }
854 }
855 
856 /**
857  * migration_incoming_setup: Setup incoming migration
858  * @f: file for main migration channel
859  */
860 static void migration_incoming_setup(QEMUFile *f)
861 {
862     MigrationIncomingState *mis = migration_incoming_get_current();
863 
864     if (!mis->from_src_file) {
865         mis->from_src_file = f;
866     }
867     qemu_file_set_blocking(f, false);
868 }
869 
870 void migration_incoming_process(void)
871 {
872     Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
873     qemu_coroutine_enter(co);
874 }
875 
876 /* Returns true if recovered from a paused migration, otherwise false */
877 static bool postcopy_try_recover(void)
878 {
879     MigrationIncomingState *mis = migration_incoming_get_current();
880 
881     if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
882         /* Resumed from a paused postcopy migration */
883 
884         /* This should be set already in migration_incoming_setup() */
885         assert(mis->from_src_file);
886         /* Postcopy has standalone thread to do vm load */
887         qemu_file_set_blocking(mis->from_src_file, true);
888 
889         /* Re-configure the return path */
890         mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
891 
892         migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
893                           MIGRATION_STATUS_POSTCOPY_RECOVER);
894 
895         /*
896          * Here, we only wake up the main loading thread (while the
897          * other threads will still be waiting), so that we can receive
898          * commands from the source now, and answer them if needed. The
899          * other threads will be woken up afterwards, once we are sure
900          * that the source is ready to reply to page requests.
901          */
902         qemu_sem_post(&mis->postcopy_pause_sem_dst);
903         return true;
904     }
905 
906     return false;
907 }
908 
909 void migration_fd_process_incoming(QEMUFile *f)
910 {
911     migration_incoming_setup(f);
912     if (postcopy_try_recover()) {
913         return;
914     }
915     migration_incoming_process();
916 }
917 
918 /*
919  * Returns true when we want to start a new incoming migration process,
920  * false otherwise.
921  */
922 static bool migration_should_start_incoming(bool main_channel)
923 {
924     /* Multifd doesn't start unless all channels are established */
925     if (migrate_multifd()) {
926         return migration_has_all_channels();
927     }
928 
929     /* Preempt channel only starts when the main channel is created */
930     if (migrate_postcopy_preempt()) {
931         return main_channel;
932     }
933 
934     /*
935      * For all other types of migration, we should only reach here when
936      * it's the main channel that's being created, and we should always
937      * proceed with this channel.
938      */
939     assert(main_channel);
940     return true;
941 }
942 
943 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
944 {
945     MigrationIncomingState *mis = migration_incoming_get_current();
946     Error *local_err = NULL;
947     QEMUFile *f;
948     bool default_channel = true;
949     uint32_t channel_magic = 0;
950     int ret = 0;
951 
952     if (migrate_multifd() && !migrate_mapped_ram() &&
953         !migrate_postcopy_ram() &&
954         qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) {
955         /*
956          * With multiple channels, it is possible that we receive channels
957          * out of order on the destination side, causing an incorrect
958          * mapping of source channels on the destination. Check the channel
959          * MAGIC to decide the type of channel. Note this is best effort:
960          * the postcopy preempt channel does not send any magic number, so
961          * avoid it for postcopy live migration. Also, TLS live migration
962          * already does a TLS handshake while initializing the main channel,
963          * so with TLS this issue is not possible.
964          */
965         ret = migration_channel_read_peek(ioc, (void *)&channel_magic,
966                                           sizeof(channel_magic), errp);
967 
968         if (ret != 0) {
969             return;
970         }
971 
972         default_channel = (channel_magic == cpu_to_be32(QEMU_VM_FILE_MAGIC));
973     } else {
974         default_channel = !mis->from_src_file;
975     }
976 
977     if (multifd_recv_setup(errp) != 0) {
978         return;
979     }
980 
981     if (default_channel) {
982         f = qemu_file_new_input(ioc);
983         migration_incoming_setup(f);
984     } else {
985         /* Multiple connections */
986         assert(migration_needs_multiple_sockets());
987         if (migrate_multifd()) {
988             multifd_recv_new_channel(ioc, &local_err);
989         } else {
990             assert(migrate_postcopy_preempt());
991             f = qemu_file_new_input(ioc);
992             postcopy_preempt_new_channel(mis, f);
993         }
994         if (local_err) {
995             error_propagate(errp, local_err);
996             return;
997         }
998     }
999 
1000     if (migration_should_start_incoming(default_channel)) {
1001         /* If it's a recovery, we're done */
1002         if (postcopy_try_recover()) {
1003             return;
1004         }
1005         migration_incoming_process();
1006     }
1007 }
1008 
1009 /**
1010  * @migration_has_all_channels: We have received all channels that we need
1011  *
1012  * Returns true when we have got connections to all the channels that
1013  * we need for migration.
1014  */
1015 bool migration_has_all_channels(void)
1016 {
1017     MigrationIncomingState *mis = migration_incoming_get_current();
1018 
1019     if (!mis->from_src_file) {
1020         return false;
1021     }
1022 
1023     if (migrate_multifd()) {
1024         return multifd_recv_all_channels_created();
1025     }
1026 
1027     if (migrate_postcopy_preempt()) {
1028         return mis->postcopy_qemufile_dst != NULL;
1029     }
1030 
1031     return true;
1032 }
1033 
1034 int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
1035 {
1036     return migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK, 0, NULL);
1037 }
1038 
1039 /*
1040  * Send a 'SHUT' message on the return channel with the given value
1041  * to indicate that we've finished with the RP.  Non-0 value indicates
1042  * error.
1043  */
1044 void migrate_send_rp_shut(MigrationIncomingState *mis,
1045                           uint32_t value)
1046 {
1047     uint32_t buf;
1048 
1049     buf = cpu_to_be32(value);
1050     migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
1051 }
1052 
1053 /*
1054  * Send a 'PONG' message on the return channel with the given value
1055  * (normally in response to a 'PING')
1056  */
1057 void migrate_send_rp_pong(MigrationIncomingState *mis,
1058                           uint32_t value)
1059 {
1060     uint32_t buf;
1061 
1062     buf = cpu_to_be32(value);
1063     migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
1064 }
1065 
1066 void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
1067                                  char *block_name)
1068 {
1069     char buf[512];
1070     int len;
1071     int64_t res;
1072 
1073     /*
1074      * First, we send the header part. It contains only the len of
1075      * idstr, and the idstr itself.
1076      */
1077     len = strlen(block_name);
1078     buf[0] = len;
1079     memcpy(buf + 1, block_name, len);
1080 
1081     if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
1082         error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
1083                      __func__);
1084         return;
1085     }
1086 
1087     migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);
1088 
1089     /*
1090      * Next, we dump the received bitmap to the stream.
1091      *
1092      * TODO: currently we are safe since we are the only one using the
1093      * to_src_file handle (the fault thread is still paused), so it's OK
1094      * even without taking the mutex. However, the best way is to take
1095      * the lock before sending the message header, and release it after
1096      * sending the bitmap.
1097      */
1098     qemu_mutex_lock(&mis->rp_mutex);
1099     res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
1100     qemu_mutex_unlock(&mis->rp_mutex);
1101 
1102     trace_migrate_send_rp_recv_bitmap(block_name, res);
1103 }
1104 
1105 void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
1106 {
1107     uint32_t buf;
1108 
1109     buf = cpu_to_be32(value);
1110     migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
1111 }
1112 
1113 /*
1114  * Return true if we're already in the middle of a migration
1115  * (i.e. any of the active or setup states)
1116  */
1117 bool migration_is_setup_or_active(void)
1118 {
1119     MigrationState *s = current_migration;
1120 
1121     switch (s->state) {
1122     case MIGRATION_STATUS_ACTIVE:
1123     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1124     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1125     case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
1126     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1127     case MIGRATION_STATUS_SETUP:
1128     case MIGRATION_STATUS_PRE_SWITCHOVER:
1129     case MIGRATION_STATUS_DEVICE:
1130     case MIGRATION_STATUS_WAIT_UNPLUG:
1131     case MIGRATION_STATUS_COLO:
1132         return true;
1133 
1134     default:
1135         return false;
1136 
1137     }
1138 }
1139 
1140 bool migration_is_running(void)
1141 {
1142     MigrationState *s = current_migration;
1143 
1144     switch (s->state) {
1145     case MIGRATION_STATUS_ACTIVE:
1146     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1147     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1148     case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
1149     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1150     case MIGRATION_STATUS_SETUP:
1151     case MIGRATION_STATUS_PRE_SWITCHOVER:
1152     case MIGRATION_STATUS_DEVICE:
1153     case MIGRATION_STATUS_WAIT_UNPLUG:
1154     case MIGRATION_STATUS_CANCELLING:
1155         return true;
1156 
1157     default:
1158         return false;
1159 
1160     }
1161 }
1162 
1163 static bool migrate_show_downtime(MigrationState *s)
1164 {
1165     return (s->state == MIGRATION_STATUS_COMPLETED) || migration_in_postcopy();
1166 }
1167 
1168 static void populate_time_info(MigrationInfo *info, MigrationState *s)
1169 {
1170     info->has_status = true;
1171     info->has_setup_time = true;
1172     info->setup_time = s->setup_time;
1173 
1174     if (s->state == MIGRATION_STATUS_COMPLETED) {
1175         info->has_total_time = true;
1176         info->total_time = s->total_time;
1177     } else {
1178         info->has_total_time = true;
1179         info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
1180                            s->start_time;
1181     }
1182 
1183     if (migrate_show_downtime(s)) {
1184         info->has_downtime = true;
1185         info->downtime = s->downtime;
1186     } else {
1187         info->has_expected_downtime = true;
1188         info->expected_downtime = s->expected_downtime;
1189     }
1190 }
1191 
1192 static void populate_ram_info(MigrationInfo *info, MigrationState *s)
1193 {
1194     size_t page_size = qemu_target_page_size();
1195 
1196     info->ram = g_malloc0(sizeof(*info->ram));
1197     info->ram->transferred = migration_transferred_bytes();
1198     info->ram->total = ram_bytes_total();
1199     info->ram->duplicate = stat64_get(&mig_stats.zero_pages);
1200     info->ram->normal = stat64_get(&mig_stats.normal_pages);
1201     info->ram->normal_bytes = info->ram->normal * page_size;
1202     info->ram->mbps = s->mbps;
1203     info->ram->dirty_sync_count =
1204         stat64_get(&mig_stats.dirty_sync_count);
1205     info->ram->dirty_sync_missed_zero_copy =
1206         stat64_get(&mig_stats.dirty_sync_missed_zero_copy);
1207     info->ram->postcopy_requests =
1208         stat64_get(&mig_stats.postcopy_requests);
1209     info->ram->page_size = page_size;
1210     info->ram->multifd_bytes = stat64_get(&mig_stats.multifd_bytes);
1211     info->ram->pages_per_second = s->pages_per_second;
1212     info->ram->precopy_bytes = stat64_get(&mig_stats.precopy_bytes);
1213     info->ram->downtime_bytes = stat64_get(&mig_stats.downtime_bytes);
1214     info->ram->postcopy_bytes = stat64_get(&mig_stats.postcopy_bytes);
1215 
1216     if (migrate_xbzrle()) {
1217         info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
1218         info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
1219         info->xbzrle_cache->bytes = xbzrle_counters.bytes;
1220         info->xbzrle_cache->pages = xbzrle_counters.pages;
1221         info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
1222         info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
1223         info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
1224         info->xbzrle_cache->overflow = xbzrle_counters.overflow;
1225     }
1226 
1227     if (cpu_throttle_active()) {
1228         info->has_cpu_throttle_percentage = true;
1229         info->cpu_throttle_percentage = cpu_throttle_get_percentage();
1230     }
1231 
1232     if (s->state != MIGRATION_STATUS_COMPLETED) {
1233         info->ram->remaining = ram_bytes_remaining();
1234         info->ram->dirty_pages_rate =
1235            stat64_get(&mig_stats.dirty_pages_rate);
1236     }
1237 
1238     if (migrate_dirty_limit() && dirtylimit_in_service()) {
1239         info->has_dirty_limit_throttle_time_per_round = true;
1240         info->dirty_limit_throttle_time_per_round =
1241                             dirtylimit_throttle_time_per_round();
1242 
1243         info->has_dirty_limit_ring_full_time = true;
1244         info->dirty_limit_ring_full_time = dirtylimit_ring_full_time();
1245     }
1246 }
1247 
1248 static void fill_source_migration_info(MigrationInfo *info)
1249 {
1250     MigrationState *s = migrate_get_current();
1251     int state = qatomic_read(&s->state);
1252     GSList *cur_blocker = migration_blockers[migrate_mode()];
1253 
1254     info->blocked_reasons = NULL;
1255 
1256     /*
1257      * There are two types of reasons a migration might be blocked:
1258      * a) devices marked in VMState as non-migratable, and
1259      * b) explicit migration blockers.
1260      * We need to add both of them here.
1261      */
1262     qemu_savevm_non_migratable_list(&info->blocked_reasons);
1263 
1264     while (cur_blocker) {
1265         QAPI_LIST_PREPEND(info->blocked_reasons,
1266                           g_strdup(error_get_pretty(cur_blocker->data)));
1267         cur_blocker = g_slist_next(cur_blocker);
1268     }
1269     info->has_blocked_reasons = info->blocked_reasons != NULL;
1270 
1271     switch (state) {
1272     case MIGRATION_STATUS_NONE:
1273         /* no migration has happened ever */
1274         /* do not overwrite destination migration status */
1275         return;
1276     case MIGRATION_STATUS_SETUP:
1277         info->has_status = true;
1278         info->has_total_time = false;
1279         break;
1280     case MIGRATION_STATUS_ACTIVE:
1281     case MIGRATION_STATUS_CANCELLING:
1282     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1283     case MIGRATION_STATUS_PRE_SWITCHOVER:
1284     case MIGRATION_STATUS_DEVICE:
1285     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1286     case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
1287     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1288         /* TODO add some postcopy stats */
1289         populate_time_info(info, s);
1290         populate_ram_info(info, s);
1291         migration_populate_vfio_info(info);
1292         break;
1293     case MIGRATION_STATUS_COLO:
1294         info->has_status = true;
1295         /* TODO: display COLO specific information (checkpoint info etc.) */
1296         break;
1297     case MIGRATION_STATUS_COMPLETED:
1298         populate_time_info(info, s);
1299         populate_ram_info(info, s);
1300         migration_populate_vfio_info(info);
1301         break;
1302     case MIGRATION_STATUS_FAILED:
1303         info->has_status = true;
1304         break;
1305     case MIGRATION_STATUS_CANCELLED:
1306         info->has_status = true;
1307         break;
1308     case MIGRATION_STATUS_WAIT_UNPLUG:
1309         info->has_status = true;
1310         break;
1311     }
1312     info->status = state;
1313 
1314     QEMU_LOCK_GUARD(&s->error_mutex);
1315     if (s->error) {
1316         info->error_desc = g_strdup(error_get_pretty(s->error));
1317     }
1318 }
1319 
1320 static void fill_destination_migration_info(MigrationInfo *info)
1321 {
1322     MigrationIncomingState *mis = migration_incoming_get_current();
1323 
1324     if (mis->socket_address_list) {
1325         info->has_socket_address = true;
1326         info->socket_address =
1327             QAPI_CLONE(SocketAddressList, mis->socket_address_list);
1328     }
1329 
1330     switch (mis->state) {
1331     case MIGRATION_STATUS_SETUP:
1332     case MIGRATION_STATUS_CANCELLING:
1333     case MIGRATION_STATUS_CANCELLED:
1334     case MIGRATION_STATUS_ACTIVE:
1335     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1336     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1337     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1338     case MIGRATION_STATUS_FAILED:
1339     case MIGRATION_STATUS_COLO:
1340         info->has_status = true;
1341         break;
1342     case MIGRATION_STATUS_COMPLETED:
1343         info->has_status = true;
1344         fill_destination_postcopy_migration_info(info);
1345         break;
1346     default:
1347         return;
1348     }
1349     info->status = mis->state;
1350 
1351     if (!info->error_desc) {
1352         MigrationState *s = migrate_get_current();
1353         QEMU_LOCK_GUARD(&s->error_mutex);
1354 
1355         if (s->error) {
1356             info->error_desc = g_strdup(error_get_pretty(s->error));
1357         }
1358     }
1359 }
1360 
1361 MigrationInfo *qmp_query_migrate(Error **errp)
1362 {
1363     MigrationInfo *info = g_malloc0(sizeof(*info));
1364 
1365     fill_destination_migration_info(info);
1366     fill_source_migration_info(info);
1367 
1368     return info;
1369 }
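/*
 * Illustrative note (not in the original source): this is the handler
 * behind the QMP "query-migrate" command, e.g.
 *
 *     -> { "execute": "query-migrate" }
 *     <- { "return": { "status": "completed", ... } }
 *
 * Destination info is filled first; the source side then overwrites the
 * status unless its state is MIGRATION_STATUS_NONE (no outgoing
 * migration has ever run).
 */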
1370 
1371 void qmp_migrate_start_postcopy(Error **errp)
1372 {
1373     MigrationState *s = migrate_get_current();
1374 
1375     if (!migrate_postcopy()) {
1376         error_setg(errp, "Enable postcopy with migrate_set_capability before"
1377                          " the start of migration");
1378         return;
1379     }
1380 
1381     if (s->state == MIGRATION_STATUS_NONE) {
1382         error_setg(errp, "Postcopy must be started after migration has been"
1383                          " started");
1384         return;
1385     }
1386     /*
1387      * We don't error if migration has finished, since that would be
1388      * racy with issuing this command.
1389      */
1390     qatomic_set(&s->start_postcopy, true);
1391 }
1392 
1393 /* shared migration helpers */
1394 
1395 void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
1396                        MigrationStatus new_state)
1397 {
1398     assert(new_state < MIGRATION_STATUS__MAX);
1399     if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
1400         trace_migrate_set_state(MigrationStatus_str(new_state));
1401         migrate_generate_event(new_state);
1402     }
1403 }
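/*
 * Illustrative note (not in the original source): the cmpxchg makes the
 * transition conditional, so racing updaters cannot clobber each other,
 * e.g.
 *
 *     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
 *                       MIGRATION_STATUS_FAILED);
 *
 * only fires (and emits the MIGRATION event, when events are enabled)
 * if the state still equals MIGRATION_STATUS_SETUP at that moment;
 * otherwise it is a no-op.
 */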
1404 
1405 static void migrate_fd_cleanup(MigrationState *s)
1406 {
1407     MigrationEventType type;
1408 
1409     g_free(s->hostname);
1410     s->hostname = NULL;
1411     json_writer_free(s->vmdesc);
1412     s->vmdesc = NULL;
1413 
1414     qemu_savevm_state_cleanup();
1415 
1416     close_return_path_on_source(s);
1417 
1418     if (s->to_dst_file) {
1419         QEMUFile *tmp;
1420 
1421         trace_migrate_fd_cleanup();
1422         bql_unlock();
1423         if (s->migration_thread_running) {
1424             qemu_thread_join(&s->thread);
1425             s->migration_thread_running = false;
1426         }
1427         bql_lock();
1428 
1429         multifd_send_shutdown();
1430         qemu_mutex_lock(&s->qemu_file_lock);
1431         tmp = s->to_dst_file;
1432         s->to_dst_file = NULL;
1433         qemu_mutex_unlock(&s->qemu_file_lock);
1434         /*
1435          * Close the file handle without the lock to make sure the
1436          * critical section won't block for long.
1437          */
1438         migration_ioc_unregister_yank_from_file(tmp);
1439         qemu_fclose(tmp);
1440     }
1441 
1442     assert(!migration_is_active());
1443 
1444     if (s->state == MIGRATION_STATUS_CANCELLING) {
1445         migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
1446                           MIGRATION_STATUS_CANCELLED);
1447     }
1448 
1449     if (s->error) {
1450         /* It is used by 'info migrate'; we can't free it */
1451         error_report_err(error_copy(s->error));
1452     }
1453     type = migration_has_failed(s) ? MIG_EVENT_PRECOPY_FAILED :
1454                                      MIG_EVENT_PRECOPY_DONE;
1455     migration_call_notifiers(s, type, NULL);
1456     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1457 }
1458 
1459 static void migrate_fd_cleanup_bh(void *opaque)
1460 {
1461     migrate_fd_cleanup(opaque);
1462 }
1463 
1464 void migrate_set_error(MigrationState *s, const Error *error)
1465 {
1466     QEMU_LOCK_GUARD(&s->error_mutex);
1467 
1468     trace_migrate_error(error_get_pretty(error));
1469 
1470     if (!s->error) {
1471         s->error = error_copy(error);
1472     }
1473 }
1474 
1475 bool migrate_has_error(MigrationState *s)
1476 {
1477     /* The lock is not helpful here, but still follow the rule */
1478     QEMU_LOCK_GUARD(&s->error_mutex);
1479     return qatomic_read(&s->error);
1480 }
1481 
1482 static void migrate_error_free(MigrationState *s)
1483 {
1484     QEMU_LOCK_GUARD(&s->error_mutex);
1485     if (s->error) {
1486         error_free(s->error);
1487         s->error = NULL;
1488     }
1489 }
1490 
1491 static void migrate_fd_error(MigrationState *s, const Error *error)
1492 {
1493     MigrationStatus current = s->state;
1494     MigrationStatus next;
1495 
1496     assert(s->to_dst_file == NULL);
1497 
1498     switch (current) {
1499     case MIGRATION_STATUS_SETUP:
1500         next = MIGRATION_STATUS_FAILED;
1501         break;
1502     case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
1503         /* Never fail a postcopy migration; switch back to PAUSED instead */
1504         next = MIGRATION_STATUS_POSTCOPY_PAUSED;
1505         break;
1506     default:
1507         /*
1508          * This really shouldn't happen. Be careful not to crash a VM
1509          * just for this.  Instead, dump something.
1510          */
1511         error_report("%s: Illegal migration status (%s) detected",
1512                      __func__, MigrationStatus_str(current));
1513         return;
1514     }
1515 
1516     migrate_set_state(&s->state, current, next);
1517     migrate_set_error(s, error);
1518 }
1519 
1520 static void migrate_fd_cancel(MigrationState *s)
1521 {
1522     int old_state;
1523 
1524     trace_migrate_fd_cancel();
1525 
1526     WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
1527         if (s->rp_state.from_dst_file) {
1528             /* shut down the rp socket, causing the rp thread to exit */
1529             qemu_file_shutdown(s->rp_state.from_dst_file);
1530         }
1531     }
1532 
1533     do {
1534         old_state = s->state;
1535         if (!migration_is_running()) {
1536             break;
1537         }
1538         /* If the migration is paused, kick it out of the pause */
1539         if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
1540             qemu_sem_post(&s->pause_sem);
1541         }
1542         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
1543     } while (s->state != MIGRATION_STATUS_CANCELLING);
1544 
1545     /*
1546      * If we're unlucky the migration code might be stuck somewhere in a
1547      * send/write while the network has failed and is waiting to time out;
1548      * if we've got shutdown(2) available then we can force it to quit.
1549      */
1550     if (s->state == MIGRATION_STATUS_CANCELLING) {
1551         WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
1552             if (s->to_dst_file) {
1553                 qemu_file_shutdown(s->to_dst_file);
1554             }
1555         }
1556     }
1557     if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
1558         Error *local_err = NULL;
1559 
1560         bdrv_activate_all(&local_err);
1561         if (local_err) {
1562             error_report_err(local_err);
1563         } else {
1564             s->block_inactive = false;
1565         }
1566     }
1567 }
1568 
1569 void migration_add_notifier_mode(NotifierWithReturn *notify,
1570                                  MigrationNotifyFunc func, MigMode mode)
1571 {
1572     notify->notify = (NotifierWithReturnFunc)func;
1573     notifier_with_return_list_add(&migration_state_notifiers[mode], notify);
1574 }
1575 
1576 void migration_add_notifier(NotifierWithReturn *notify,
1577                             MigrationNotifyFunc func)
1578 {
1579     migration_add_notifier_mode(notify, func, MIG_MODE_NORMAL);
1580 }
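/*
 * Illustrative usage sketch (hypothetical callback, not from this file;
 * assuming the MigrationNotifyFunc signature declared in the migration
 * headers):
 *
 *     static int my_mig_notify(NotifierWithReturn *notifier,
 *                              MigrationEvent *e, Error **errp)
 *     {
 *         if (e->type == MIG_EVENT_PRECOPY_SETUP) {
 *             ... prepare for migration; may fail via errp ...
 *         }
 *         return 0;
 *     }
 *
 *     static NotifierWithReturn my_notifier;
 *     migration_add_notifier(&my_notifier, my_mig_notify);
 *
 * Per the assertion in migration_call_notifiers() below, only
 * MIG_EVENT_PRECOPY_SETUP notifications are allowed to fail.
 */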
1581 
1582 void migration_remove_notifier(NotifierWithReturn *notify)
1583 {
1584     if (notify->notify) {
1585         notifier_with_return_remove(notify);
1586         notify->notify = NULL;
1587     }
1588 }
1589 
1590 int migration_call_notifiers(MigrationState *s, MigrationEventType type,
1591                              Error **errp)
1592 {
1593     MigMode mode = s->parameters.mode;
1594     MigrationEvent e;
1595     int ret;
1596 
1597     e.type = type;
1598     ret = notifier_with_return_list_notify(&migration_state_notifiers[mode],
1599                                            &e, errp);
1600     assert(!ret || type == MIG_EVENT_PRECOPY_SETUP);
1601     return ret;
1602 }
1603 
1604 bool migration_has_failed(MigrationState *s)
1605 {
1606     return (s->state == MIGRATION_STATUS_CANCELLED ||
1607             s->state == MIGRATION_STATUS_FAILED);
1608 }
1609 
1610 bool migration_in_postcopy(void)
1611 {
1612     MigrationState *s = migrate_get_current();
1613 
1614     switch (s->state) {
1615     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1616     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1617     case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
1618     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1619         return true;
1620     default:
1621         return false;
1622     }
1623 }
1624 
1625 bool migration_postcopy_is_alive(MigrationStatus state)
1626 {
1627     switch (state) {
1628     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1629     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1630         return true;
1631     default:
1632         return false;
1633     }
1634 }
1635 
1636 bool migration_in_incoming_postcopy(void)
1637 {
1638     PostcopyState ps = postcopy_state_get();
1639 
1640     return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
1641 }
1642 
1643 bool migration_incoming_postcopy_advised(void)
1644 {
1645     PostcopyState ps = postcopy_state_get();
1646 
1647     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
1648 }
1649 
1650 bool migration_in_bg_snapshot(void)
1651 {
1652     return migrate_background_snapshot() &&
1653            migration_is_setup_or_active();
1654 }
1655 
1656 bool migration_is_idle(void)
1657 {
1658     MigrationState *s = current_migration;
1659 
1660     if (!s) {
1661         return true;
1662     }
1663 
1664     switch (s->state) {
1665     case MIGRATION_STATUS_NONE:
1666     case MIGRATION_STATUS_CANCELLED:
1667     case MIGRATION_STATUS_COMPLETED:
1668     case MIGRATION_STATUS_FAILED:
1669         return true;
1670     default:
1671         return false;
1672     }
1673 }
1674 
1675 bool migration_is_active(void)
1676 {
1677     MigrationState *s = current_migration;
1678 
1679     return (s->state == MIGRATION_STATUS_ACTIVE ||
1680             s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
1681 }
1682 
1683 bool migration_is_device(void)
1684 {
1685     MigrationState *s = current_migration;
1686 
1687     return s->state == MIGRATION_STATUS_DEVICE;
1688 }
1689 
1690 bool migration_thread_is_self(void)
1691 {
1692     MigrationState *s = current_migration;
1693 
1694     return qemu_thread_is_self(&s->thread);
1695 }
1696 
1697 bool migrate_mode_is_cpr(MigrationState *s)
1698 {
1699     return s->parameters.mode == MIG_MODE_CPR_REBOOT;
1700 }
1701 
1702 int migrate_init(MigrationState *s, Error **errp)
1703 {
1704     int ret;
1705 
1706     ret = qemu_savevm_state_prepare(errp);
1707     if (ret) {
1708         return ret;
1709     }
1710 
1711     /*
1712      * Reinitialise all migration state, except
1713      * parameters/capabilities that the user set, and
1714      * locks.
1715      */
1716     s->to_dst_file = NULL;
1717     s->state = MIGRATION_STATUS_NONE;
1718     s->rp_state.from_dst_file = NULL;
1719     s->mbps = 0.0;
1720     s->pages_per_second = 0.0;
1721     s->downtime = 0;
1722     s->expected_downtime = 0;
1723     s->setup_time = 0;
1724     s->start_postcopy = false;
1725     s->migration_thread_running = false;
1726     error_free(s->error);
1727     s->error = NULL;
1728     s->vmdesc = NULL;
1729 
1730     migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
1731 
1732     s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1733     s->total_time = 0;
1734     s->vm_old_state = -1;
1735     s->iteration_initial_bytes = 0;
1736     s->threshold_size = 0;
1737     s->switchover_acked = false;
1738     s->rdma_migration = false;
1739     /*
1740      * set mig_stats memory to zero for a new migration
1741      */
1742     memset(&mig_stats, 0, sizeof(mig_stats));
1743     migration_reset_vfio_bytes_transferred();
1744 
1745     return 0;
1746 }
1747 
1748 static bool is_busy(Error **reasonp, Error **errp)
1749 {
1750     ERRP_GUARD();
1751 
1752     /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
1753     if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
1754         error_propagate_prepend(errp, *reasonp,
1755                                 "disallowing migration blocker "
1756                                 "(migration/snapshot in progress) for: ");
1757         *reasonp = NULL;
1758         return true;
1759     }
1760     return false;
1761 }
1762 
1763 static bool is_only_migratable(Error **reasonp, Error **errp, int modes)
1764 {
1765     ERRP_GUARD();
1766 
1767     if (only_migratable && (modes & BIT(MIG_MODE_NORMAL))) {
1768         error_propagate_prepend(errp, *reasonp,
1769                                 "disallowing migration blocker "
1770                                 "(--only-migratable) for: ");
1771         *reasonp = NULL;
1772         return true;
1773     }
1774     return false;
1775 }
1776 
1777 static int get_modes(MigMode mode, va_list ap)
1778 {
1779     int modes = 0;
1780 
1781     while (mode != -1 && mode != MIG_MODE_ALL) {
1782         assert(mode >= MIG_MODE_NORMAL && mode < MIG_MODE__MAX);
1783         modes |= BIT(mode);
1784         mode = va_arg(ap, MigMode);
1785     }
1786     if (mode == MIG_MODE_ALL) {
1787         modes = BIT(MIG_MODE__MAX) - 1;
1788     }
1789     return modes;
1790 }
1791 
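/*
 * For example, get_modes(MIG_MODE_NORMAL, ap) with the remaining
 * arguments being MIG_MODE_CPR_REBOOT, -1 yields
 * BIT(MIG_MODE_NORMAL) | BIT(MIG_MODE_CPR_REBOOT), while a lone
 * MIG_MODE_ALL yields a mask with every mode bit set.
 */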
1792 static int add_blockers(Error **reasonp, Error **errp, int modes)
1793 {
1794     for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) {
1795         if (modes & BIT(mode)) {
1796             migration_blockers[mode] = g_slist_prepend(migration_blockers[mode],
1797                                                        *reasonp);
1798         }
1799     }
1800     return 0;
1801 }
1802 
1803 int migrate_add_blocker(Error **reasonp, Error **errp)
1804 {
1805     return migrate_add_blocker_modes(reasonp, errp, MIG_MODE_ALL);
1806 }
1807 
1808 int migrate_add_blocker_normal(Error **reasonp, Error **errp)
1809 {
1810     return migrate_add_blocker_modes(reasonp, errp, MIG_MODE_NORMAL, -1);
1811 }
1812 
1813 int migrate_add_blocker_modes(Error **reasonp, Error **errp, MigMode mode, ...)
1814 {
1815     int modes;
1816     va_list ap;
1817 
1818     va_start(ap, mode);
1819     modes = get_modes(mode, ap);
1820     va_end(ap);
1821 
1822     if (is_only_migratable(reasonp, errp, modes)) {
1823         return -EACCES;
1824     } else if (is_busy(reasonp, errp)) {
1825         return -EBUSY;
1826     }
1827     return add_blockers(reasonp, errp, modes);
1828 }
1829 
1830 int migrate_add_blocker_internal(Error **reasonp, Error **errp)
1831 {
1832     int modes = BIT(MIG_MODE__MAX) - 1;
1833 
1834     if (is_busy(reasonp, errp)) {
1835         return -EBUSY;
1836     }
1837     return add_blockers(reasonp, errp, modes);
1838 }
1839 
1840 void migrate_del_blocker(Error **reasonp)
1841 {
1842     if (*reasonp) {
1843         for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) {
1844             migration_blockers[mode] = g_slist_remove(migration_blockers[mode],
1845                                                       *reasonp);
1846         }
1847         error_free(*reasonp);
1848         *reasonp = NULL;
1849     }
1850 }
1851 
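/*
 * Typical blocker usage from a device model (a sketch; 'mydev' is a
 * hypothetical name).  On failure *reasonp is consumed and set to NULL;
 * on the success path migrate_del_blocker() frees it later, so the
 * caller never calls error_free() on it directly:
 *
 *     static Error *mydev_mig_blocker;
 *
 *     error_setg(&mydev_mig_blocker, "mydev: feature X is in use");
 *     if (migrate_add_blocker(&mydev_mig_blocker, errp) < 0) {
 *         return;    // -EBUSY/-EACCES; mydev_mig_blocker is now NULL
 *     }
 *     ...
 *     migrate_del_blocker(&mydev_mig_blocker);
 *
 * To block only specific modes, terminate the mode list with -1:
 *
 *     migrate_add_blocker_modes(&blocker, errp, MIG_MODE_NORMAL, -1);
 */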
1852 void qmp_migrate_incoming(const char *uri, bool has_channels,
1853                           MigrationChannelList *channels,
1854                           bool has_exit_on_error, bool exit_on_error,
1855                           Error **errp)
1856 {
1857     Error *local_err = NULL;
1858     static bool once = true;
1859     MigrationIncomingState *mis = migration_incoming_get_current();
1860 
1861     if (!once) {
1862         error_setg(errp, "The incoming migration has already been started");
1863         return;
1864     }
1865     if (!runstate_check(RUN_STATE_INMIGRATE)) {
1866         error_setg(errp, "'-incoming' was not specified on the command line");
1867         return;
1868     }
1869 
1870     if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
1871         return;
1872     }
1873 
1874     mis->exit_on_error =
1875         has_exit_on_error ? exit_on_error : INMIGRATE_DEFAULT_EXIT_ON_ERROR;
1876 
1877     qemu_start_incoming_migration(uri, has_channels, channels, &local_err);
1878 
1879     if (local_err) {
1880         yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1881         error_propagate(errp, local_err);
1882         return;
1883     }
1884 
1885     once = false;
1886 }
1887 
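/*
 * Example QMP usage, assuming QEMU was started with '-incoming defer':
 *
 *     { "execute": "migrate-incoming",
 *       "arguments": { "uri": "tcp::4446" } }
 */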
1888 void qmp_migrate_recover(const char *uri, Error **errp)
1889 {
1890     MigrationIncomingState *mis = migration_incoming_get_current();
1891 
1892     /*
1893      * Don't even bother to use ERRP_GUARD(): errp _must_ always be set by
1894      * callers (no one should ignore a recover failure); if it is NULL,
1895      * that's a programming error.
1896      */
1897     assert(errp);
1898 
1899     if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
1900         error_setg(errp, "Migrate recover can only be run "
1901                    "when postcopy is paused.");
1902         return;
1903     }
1904 
1905     /* If there's an existing transport, release it */
1906     migration_incoming_transport_cleanup(mis);
1907 
1908     /*
1909      * Note that this call will never start a real migration; it will
1910      * only re-set up the migration stream and poke the existing migration
1911      * to continue using that newly established channel.
1912      */
1913     qemu_start_incoming_migration(uri, false, NULL, errp);
1914 }
1915 
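/*
 * Example QMP usage on the destination, once a network failure has left
 * it in the postcopy-paused state:
 *
 *     { "execute": "migrate-recover",
 *       "arguments": { "uri": "tcp::5556" } }
 */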
1916 void qmp_migrate_pause(Error **errp)
1917 {
1918     MigrationState *ms = migrate_get_current();
1919     MigrationIncomingState *mis = migration_incoming_get_current();
1920     int ret = 0;
1921 
1922     if (migration_postcopy_is_alive(ms->state)) {
1923         /* Source side, during postcopy */
1924         Error *error = NULL;
1925 
1926         /* Tell the core migration that we're pausing */
1927         error_setg(&error, "Postcopy migration is paused by the user");
1928         migrate_set_error(ms, error);
1929         error_free(error);
1930 
1931         qemu_mutex_lock(&ms->qemu_file_lock);
1932         if (ms->to_dst_file) {
1933             ret = qemu_file_shutdown(ms->to_dst_file);
1934         }
1935         qemu_mutex_unlock(&ms->qemu_file_lock);
1936         if (ret) {
1937             error_setg(errp, "Failed to pause source migration");
1938         }
1939 
1940         /*
1941          * Kick the migration thread out of any waiting windows (on behalf
1942          * of the rp thread).
1943          */
1944         migration_rp_kick(ms);
1945 
1946         return;
1947     }
1948 
1949     if (migration_postcopy_is_alive(mis->state)) {
1950         ret = qemu_file_shutdown(mis->from_src_file);
1951         if (ret) {
1952             error_setg(errp, "Failed to pause destination migration");
1953         }
1954         return;
1955     }
1956 
1957     error_setg(errp, "migrate-pause is currently only supported "
1958                "during postcopy-active or postcopy-recover state");
1959 }
1960 
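/*
 * Example QMP usage, valid on either side while postcopy is alive:
 *
 *     { "execute": "migrate-pause" }
 */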
1961 bool migration_is_blocked(Error **errp)
1962 {
1963     GSList *blockers = migration_blockers[migrate_mode()];
1964 
1965     if (qemu_savevm_state_blocked(errp)) {
1966         return true;
1967     }
1968 
1969     if (blockers) {
1970         error_propagate(errp, error_copy(blockers->data));
1971         return true;
1972     }
1973 
1974     return false;
1975 }
1976 
1977 /* Returns true if we should continue the migration, or false on error */
1978 static bool migrate_prepare(MigrationState *s, bool resume, Error **errp)
1979 {
1980     if (resume) {
1981         if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
1982             error_setg(errp, "Cannot resume if there is no "
1983                        "paused migration");
1984             return false;
1985         }
1986 
1987         /*
1988          * Postcopy recovery won't work well with the release-ram
1989          * capability, since release-ram drops a page buffer as soon
1990          * as the page is put into the send buffer.  So if a network
1991          * failure happens, any page buffers that have already been
1992          * sent from the source VM but have not yet reached the
1993          * destination VM will be lost forever.  Let's refuse to let
1994          * the client resume such a postcopy migration.  Luckily,
1995          * release-ram was designed to only be used when the source
1996          * and destination VMs are on the same host, so it should be
1997          * fine.
1998          */
1999         if (migrate_release_ram()) {
2000             error_setg(errp, "Postcopy recovery cannot work "
2001                        "when release-ram capability is set");
2002             return false;
2003         }
2004 
2005         migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
2006                           MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP);
2007 
2008         /* This is a resume, skip init status */
2009         return true;
2010     }
2011 
2012     if (migration_is_running()) {
2013         error_setg(errp, "There's a migration process in progress");
2014         return false;
2015     }
2016 
2017     if (runstate_check(RUN_STATE_INMIGRATE)) {
2018         error_setg(errp, "Guest is waiting for an incoming migration");
2019         return false;
2020     }
2021 
2022     if (runstate_check(RUN_STATE_POSTMIGRATE)) {
2023         error_setg(errp, "Can't migrate a VM that was paused due to a "
2024                    "previous migration");
2025         return false;
2026     }
2027 
2028     if (kvm_hwpoisoned_mem()) {
2029         error_setg(errp, "Can't migrate this vm with hardware poisoned memory, "
2030                    "please reboot the vm and try again");
2031         return false;
2032     }
2033 
2034     if (migration_is_blocked(errp)) {
2035         return false;
2036     }
2037 
2038     if (migrate_mapped_ram()) {
2039         if (migrate_tls()) {
2040             error_setg(errp, "Cannot use TLS with mapped-ram");
2041             return false;
2042         }
2043 
2044         if (migrate_multifd_compression()) {
2045             error_setg(errp, "Cannot use compression with mapped-ram");
2046             return false;
2047         }
2048     }
2049 
2050     if (migrate_mode_is_cpr(s)) {
2051         const char *conflict = NULL;
2052 
2053         if (migrate_postcopy()) {
2054             conflict = "postcopy";
2055         } else if (migrate_background_snapshot()) {
2056             conflict = "background snapshot";
2057         } else if (migrate_colo()) {
2058             conflict = "COLO";
2059         }
2060 
2061         if (conflict) {
2062             error_setg(errp, "Cannot use %s with CPR", conflict);
2063             return false;
2064         }
2065     }
2066 
2067     if (migrate_init(s, errp)) {
2068         return false;
2069     }
2070 
2071     return true;
2072 }
2073 
2074 void qmp_migrate(const char *uri, bool has_channels,
2075                  MigrationChannelList *channels, bool has_detach, bool detach,
2076                  bool has_resume, bool resume, Error **errp)
2077 {
2078     bool resume_requested;
2079     Error *local_err = NULL;
2080     MigrationState *s = migrate_get_current();
2081     g_autoptr(MigrationChannel) channel = NULL;
2082     MigrationAddress *addr = NULL;
2083 
2084     /*
2085      * Preliminary checks: exactly one of 'uri' and 'channels' must be given.
2086      */
2087     if (!uri == !channels) {
2088         error_setg(errp, "need either 'uri' or 'channels' argument");
2089         return;
2090     }
2091 
2092     if (channels) {
2093         /* Verify that the migration channel list has only one entry */
2094         if (channels->next) {
2095             error_setg(errp, "Channel list has more than one entry");
2096             return;
2097         }
2098         addr = channels->value->addr;
2099     }
2100 
2101     if (uri) {
2102         /* caller uses the old URI syntax */
2103         if (!migrate_uri_parse(uri, &channel, errp)) {
2104             return;
2105         }
2106         addr = channel->addr;
2107     }
2108 
2109     /* transport mechanism not suitable for migration? */
2110     if (!migration_channels_and_transport_compatible(addr, errp)) {
2111         return;
2112     }
2113 
2114     resume_requested = has_resume && resume;
2115     if (!migrate_prepare(s, resume_requested, errp)) {
2116         /* Error detected, put into errp */
2117         return;
2118     }
2119 
2120     if (!resume_requested) {
2121         if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2122             return;
2123         }
2124     }
2125 
2126     if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
2127         SocketAddress *saddr = &addr->u.socket;
2128         if (saddr->type == SOCKET_ADDRESS_TYPE_INET ||
2129             saddr->type == SOCKET_ADDRESS_TYPE_UNIX ||
2130             saddr->type == SOCKET_ADDRESS_TYPE_VSOCK) {
2131             socket_start_outgoing_migration(s, saddr, &local_err);
2132         } else if (saddr->type == SOCKET_ADDRESS_TYPE_FD) {
2133             fd_start_outgoing_migration(s, saddr->u.fd.str, &local_err);
2134         }
2135 #ifdef CONFIG_RDMA
2136     } else if (addr->transport == MIGRATION_ADDRESS_TYPE_RDMA) {
2137         rdma_start_outgoing_migration(s, &addr->u.rdma, &local_err);
2138 #endif
2139     } else if (addr->transport == MIGRATION_ADDRESS_TYPE_EXEC) {
2140         exec_start_outgoing_migration(s, addr->u.exec.args, &local_err);
2141     } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) {
2142         file_start_outgoing_migration(s, &addr->u.file, &local_err);
2143     } else {
2144         error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri",
2145                    "a valid migration protocol");
2146         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2147                           MIGRATION_STATUS_FAILED);
2148     }
2149 
2150     if (local_err) {
2151         if (!resume_requested) {
2152             yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2153         }
2154         migrate_fd_error(s, local_err);
2155         error_propagate(errp, local_err);
2156         return;
2157     }
2158 }
2159 
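/*
 * Example QMP usage.  Old-style URI form:
 *
 *     { "execute": "migrate",
 *       "arguments": { "uri": "tcp:192.168.1.2:4446" } }
 *
 * Equivalent 'channels' form (a sketch; see MigrationChannel in the
 * QAPI schema for the full syntax):
 *
 *     { "execute": "migrate",
 *       "arguments": { "channels": [
 *           { "channel-type": "main",
 *             "addr": { "transport": "socket", "type": "inet",
 *                       "host": "192.168.1.2", "port": "4446" } } ] } }
 */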
2160 void qmp_migrate_cancel(Error **errp)
2161 {
2162     migration_cancel(NULL);
2163 }
2164 
2165 void qmp_migrate_continue(MigrationStatus state, Error **errp)
2166 {
2167     MigrationState *s = migrate_get_current();
2168     if (s->state != state) {
2169         error_setg(errp,  "Migration not in expected state: %s",
2170                    MigrationStatus_str(s->state));
2171         return;
2172     }
2173     qemu_sem_post(&s->pause_sem);
2174 }
2175 
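/*
 * Example QMP usage with the 'pause-before-switchover' capability
 * enabled: once query-migrate reports the pre-switchover state, let the
 * migration finish with:
 *
 *     { "execute": "migrate-continue",
 *       "arguments": { "state": "pre-switchover" } }
 */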
2176 int migration_rp_wait(MigrationState *s)
2177 {
2178     /* If the migration has already failed, skip the wait */
2179     if (migrate_has_error(s)) {
2180         return -1;
2181     }
2182 
2183     qemu_sem_wait(&s->rp_state.rp_sem);
2184 
2185     /* After wait, double check that there's no failure */
2186     if (migrate_has_error(s)) {
2187         return -1;
2188     }
2189 
2190     return 0;
2191 }
2192 
2193 void migration_rp_kick(MigrationState *s)
2194 {
2195     qemu_sem_post(&s->rp_state.rp_sem);
2196 }
2197 
2198 static struct rp_cmd_args {
2199     ssize_t     len; /* -1 = variable */
2200     const char *name;
2201 } rp_cmd_args[] = {
2202     [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
2203     [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
2204     [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
2205     [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
2206     [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
2207     [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
2208     [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
2209     [MIG_RP_MSG_SWITCHOVER_ACK] = { .len =  0, .name = "SWITCHOVER_ACK" },
2210     [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
2211 };
2212 
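/*
 * Each return path message is framed as read by
 * source_return_path_thread() below:
 *
 *     +-------------+-------------+---------------------+
 *     | type (be16) | len (be16)  | payload (len bytes) |
 *     +-------------+-------------+---------------------+
 *
 * where 'type' is one of MIG_RP_MSG_* and 'len' must match the fixed
 * length in rp_cmd_args[] (or is message-specific when .len is -1).
 */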
2213 /*
2214  * Process a request for pages received on the return path.
2215  * We're allowed to send more than requested (e.g. to round to our page size)
2216  * and we don't need to send pages that have already been sent.
2217  */
2218 static void
2219 migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
2220                             ram_addr_t start, size_t len, Error **errp)
2221 {
2222     long our_host_ps = qemu_real_host_page_size();
2223 
2224     trace_migrate_handle_rp_req_pages(rbname, start, len);
2225 
2226     /*
2227      * Since we currently insist on matching page sizes, just sanity check
2228      * we're being asked for whole host pages.
2229      */
2230     if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
2231         !QEMU_IS_ALIGNED(len, our_host_ps)) {
2232         error_setg(errp, "MIG_RP_MSG_REQ_PAGES: Misaligned page request, start:"
2233                    RAM_ADDR_FMT " len: %zd", start, len);
2234         return;
2235     }
2236 
2237     ram_save_queue_pages(rbname, start, len, errp);
2238 }
2239 
2240 static bool migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name,
2241                                           Error **errp)
2242 {
2243     RAMBlock *block = qemu_ram_block_by_name(block_name);
2244 
2245     if (!block) {
2246         error_setg(errp, "MIG_RP_MSG_RECV_BITMAP has invalid block name '%s'",
2247                    block_name);
2248         return false;
2249     }
2250 
2251     /* Fetch the received bitmap and refresh the dirty bitmap */
2252     return ram_dirty_bitmap_reload(s, block, errp);
2253 }
2254 
2255 static bool migrate_handle_rp_resume_ack(MigrationState *s,
2256                                          uint32_t value, Error **errp)
2257 {
2258     trace_source_return_path_thread_resume_ack(value);
2259 
2260     if (value != MIGRATION_RESUME_ACK_VALUE) {
2261         error_setg(errp, "illegal resume_ack value %"PRIu32, value);
2262         return false;
2263     }
2264 
2265     /* Now both sides are active. */
2266     migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2267                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
2268 
2269     /* Notify the send thread that it's time to continue sending pages */
2270     migration_rp_kick(s);
2271 
2272     return true;
2273 }
2274 
2275 /*
2276  * Release ms->rp_state.from_dst_file (and postcopy_qemufile_src, if it
2277  * exists) in a safe way.
2278  */
2279 static void migration_release_dst_files(MigrationState *ms)
2280 {
2281     QEMUFile *file;
2282 
2283     WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2284         /*
2285          * Reset the from_dst_file pointer first before releasing it, as we
2286          * can't block within the lock section.
2287          */
2288         file = ms->rp_state.from_dst_file;
2289         ms->rp_state.from_dst_file = NULL;
2290     }
2291 
2292     /*
2293      * Do the same to the postcopy fast path socket too, if there is one.
2294      * No locking is needed because this qemufile should only be managed
2295      * by the return path thread.
2296      */
2297     if (ms->postcopy_qemufile_src) {
2298         migration_ioc_unregister_yank_from_file(ms->postcopy_qemufile_src);
2299         qemu_file_shutdown(ms->postcopy_qemufile_src);
2300         qemu_fclose(ms->postcopy_qemufile_src);
2301         ms->postcopy_qemufile_src = NULL;
2302     }
2303 
2304     qemu_fclose(file);
2305 }
2306 
2307 /*
2308  * Handles messages sent on the return path towards the source VM.
2310  */
2311 static void *source_return_path_thread(void *opaque)
2312 {
2313     MigrationState *ms = opaque;
2314     QEMUFile *rp = ms->rp_state.from_dst_file;
2315     uint16_t header_len, header_type;
2316     uint8_t buf[512];
2317     uint32_t tmp32, sibling_error;
2318     ram_addr_t start = 0; /* =0 to silence warning */
2319     size_t  len = 0, expected_len;
2320     Error *err = NULL;
2321     int res;
2322 
2323     trace_source_return_path_thread_entry();
2324     rcu_register_thread();
2325 
2326     while (migration_is_setup_or_active()) {
2327         trace_source_return_path_thread_loop_top();
2328 
2329         header_type = qemu_get_be16(rp);
2330         header_len = qemu_get_be16(rp);
2331 
2332         if (qemu_file_get_error(rp)) {
2333             qemu_file_get_error_obj(rp, &err);
2334             goto out;
2335         }
2336 
2337         if (header_type >= MIG_RP_MSG_MAX ||
2338             header_type == MIG_RP_MSG_INVALID) {
2339             error_setg(&err, "Received invalid message 0x%04x length 0x%04x",
2340                        header_type, header_len);
2341             goto out;
2342         }
2343 
2344         if ((rp_cmd_args[header_type].len != -1 &&
2345             header_len != rp_cmd_args[header_type].len) ||
2346             header_len > sizeof(buf)) {
2347             error_setg(&err, "Received '%s' message (0x%04x) with "
2348                        "incorrect length %d expecting %zu",
2349                        rp_cmd_args[header_type].name, header_type, header_len,
2350                        (size_t)rp_cmd_args[header_type].len);
2351             goto out;
2352         }
2353 
2354         /* We know we've got a valid header by this point */
2355         res = qemu_get_buffer(rp, buf, header_len);
2356         if (res != header_len) {
2357             error_setg(&err, "Failed reading data for message 0x%04x"
2358                        " read %d expected %d",
2359                        header_type, res, header_len);
2360             goto out;
2361         }
2362 
2363         /* OK, we have the message and the data */
2364         switch (header_type) {
2365         case MIG_RP_MSG_SHUT:
2366             sibling_error = ldl_be_p(buf);
2367             trace_source_return_path_thread_shut(sibling_error);
2368             if (sibling_error) {
2369                 error_setg(&err, "Sibling indicated error %d", sibling_error);
2370             }
2371             /*
2372              * We'll let the main thread deal with closing the RP;
2373              * we could do a shutdown(2) on it, but we're the only user
2374              * anyway, so there's nothing gained.
2375              */
2376             goto out;
2377 
2378         case MIG_RP_MSG_PONG:
2379             tmp32 = ldl_be_p(buf);
2380             trace_source_return_path_thread_pong(tmp32);
2381             qemu_sem_post(&ms->rp_state.rp_pong_acks);
2382             break;
2383 
2384         case MIG_RP_MSG_REQ_PAGES:
2385             start = ldq_be_p(buf);
2386             len = ldl_be_p(buf + 8);
2387             migrate_handle_rp_req_pages(ms, NULL, start, len, &err);
2388             if (err) {
2389                 goto out;
2390             }
2391             break;
2392 
2393         case MIG_RP_MSG_REQ_PAGES_ID:
2394             expected_len = 12 + 1; /* header + termination */
2395 
2396             if (header_len >= expected_len) {
2397                 start = ldq_be_p(buf);
2398                 len = ldl_be_p(buf + 8);
2399                 /* Now we expect an idstr */
2400                 tmp32 = buf[12]; /* Length of the following idstr */
2401                 buf[13 + tmp32] = '\0';
2402                 expected_len += tmp32;
2403             }
2404             if (header_len != expected_len) {
2405                 error_setg(&err, "Req_Page_id with length %d expecting %zd",
2406                            header_len, expected_len);
2407                 goto out;
2408             }
2409             migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len,
2410                                         &err);
2411             if (err) {
2412                 goto out;
2413             }
2414             break;
2415 
2416         case MIG_RP_MSG_RECV_BITMAP:
2417             if (header_len < 1) {
2418                 error_setg(&err, "MIG_RP_MSG_RECV_BITMAP missing block name");
2419                 goto out;
2420             }
2421             /* Format: len (1B) + idstr (<255B). This ends the idstr. */
2422             buf[buf[0] + 1] = '\0';
2423             if (!migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1), &err)) {
2424                 goto out;
2425             }
2426             break;
2427 
2428         case MIG_RP_MSG_RESUME_ACK:
2429             tmp32 = ldl_be_p(buf);
2430             if (!migrate_handle_rp_resume_ack(ms, tmp32, &err)) {
2431                 goto out;
2432             }
2433             break;
2434 
2435         case MIG_RP_MSG_SWITCHOVER_ACK:
2436             ms->switchover_acked = true;
2437             trace_source_return_path_thread_switchover_acked();
2438             break;
2439 
2440         default:
2441             break;
2442         }
2443     }
2444 
2445 out:
2446     if (err) {
2447         migrate_set_error(ms, err);
2448         error_free(err);
2449         trace_source_return_path_thread_bad_end();
2450     }
2451 
2452     if (ms->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2453         /*
2454          * This is extremely unlikely: we got yet another network issue
2455          * while recovering from the 1st network failure.  During this
2456          * period the main migration thread can be waiting on rp_sem for
2457          * this thread to sync with the other side.
2458          *
2459          * When this happens, explicitly kick the migration thread out of
2460          * RECOVER stage and back to PAUSED, so the admin can try
2461          * everything again.
2462          */
2463         migration_rp_kick(ms);
2464     }
2465 
2466     trace_source_return_path_thread_end();
2467     rcu_unregister_thread();
2468 
2469     return NULL;
2470 }
2471 
2472 static int open_return_path_on_source(MigrationState *ms)
2473 {
2474     ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
2475     if (!ms->rp_state.from_dst_file) {
2476         return -1;
2477     }
2478 
2479     trace_open_return_path_on_source();
2480 
2481     qemu_thread_create(&ms->rp_state.rp_thread, "mig/src/rp-thr",
2482                        source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
2483     ms->rp_state.rp_thread_created = true;
2484 
2485     trace_open_return_path_on_source_continue();
2486 
2487     return 0;
2488 }
2489 
2490 /* Return true if error detected, or false otherwise */
2491 static bool close_return_path_on_source(MigrationState *ms)
2492 {
2493     if (!ms->rp_state.rp_thread_created) {
2494         return false;
2495     }
2496 
2497     trace_migration_return_path_end_before();
2498 
2499     /*
2500      * If this is a normal exit then the destination will send a SHUT
2501      * and the rp_thread will exit, however if there's an error we
2502      * need to cause it to exit. shutdown(2), if we have it, will
2503      * cause it to unblock if it's stuck waiting for the destination.
2504      */
2505     WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2506         if (migrate_has_error(ms) && ms->rp_state.from_dst_file) {
2507             qemu_file_shutdown(ms->rp_state.from_dst_file);
2508         }
2509     }
2510 
2511     qemu_thread_join(&ms->rp_state.rp_thread);
2512     ms->rp_state.rp_thread_created = false;
2513     migration_release_dst_files(ms);
2514     trace_migration_return_path_end_after();
2515 
2516     /* The return path will persist the error in MigrationState when it quits */
2517     return migrate_has_error(ms);
2518 }
2519 
2520 static inline void
2521 migration_wait_main_channel(MigrationState *ms)
2522 {
2523     /* Wait until one PONG message is received */
2524     qemu_sem_wait(&ms->rp_state.rp_pong_acks);
2525 }
2526 
2527 /*
2528  * Switch from normal iteration to postcopy
2529  * Returns non-0 on error
2530  */
2531 static int postcopy_start(MigrationState *ms, Error **errp)
2532 {
2533     int ret;
2534     QIOChannelBuffer *bioc;
2535     QEMUFile *fb;
2536     uint64_t bandwidth = migrate_max_postcopy_bandwidth();
2537     bool restart_block = false;
2538     int cur_state = MIGRATION_STATUS_ACTIVE;
2539 
2540     if (migrate_postcopy_preempt()) {
2541         migration_wait_main_channel(ms);
2542         if (postcopy_preempt_establish_channel(ms)) {
2543             migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
2544             error_setg(errp, "%s: Failed to establish preempt channel",
2545                        __func__);
2546             return -1;
2547         }
2548     }
2549 
2550     if (!migrate_pause_before_switchover()) {
2551         migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
2552                           MIGRATION_STATUS_POSTCOPY_ACTIVE);
2553     }
2554 
2555     trace_postcopy_start();
2556     bql_lock();
2557     trace_postcopy_start_set_run();
2558 
2559     ret = migration_stop_vm(ms, RUN_STATE_FINISH_MIGRATE);
2560     if (ret < 0) {
2561         error_setg_errno(errp, -ret, "%s: Failed to stop the VM", __func__);
2562         goto fail;
2563     }
2564 
2565     ret = migration_maybe_pause(ms, &cur_state,
2566                                 MIGRATION_STATUS_POSTCOPY_ACTIVE);
2567     if (ret < 0) {
2568         error_setg_errno(errp, -ret, "%s: Failed in migration_maybe_pause()",
2569                          __func__);
2570         goto fail;
2571     }
2572 
2573     ret = bdrv_inactivate_all();
2574     if (ret < 0) {
2575         error_setg_errno(errp, -ret, "%s: Failed in bdrv_inactivate_all()",
2576                          __func__);
2577         goto fail;
2578     }
2579     restart_block = true;
2580 
2581     /*
2582      * Cause any non-postcopiable, but iterative devices to
2583      * send out their final data.
2584      */
2585     qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
2586 
2587     /*
2588      * In the "finish migrate" run state, with the BQL held, everything
2589      * should be quiet; but we've potentially still got dirty pages and we
2590      * need to tell the destination to throw away any pages it's already
2591      * received that are now dirty.
2592      */
2593     if (migrate_postcopy_ram()) {
2594         ram_postcopy_send_discard_bitmap(ms);
2595     }
2596 
2597     /*
2598      * Send the rest of the state - note that devices doing postcopy
2599      * will notice we're in POSTCOPY_ACTIVE and won't actually
2600      * wrap their state up here.
2601      */
2602     migration_rate_set(bandwidth);
2603     if (migrate_postcopy_ram()) {
2604         /* Ping just for debugging, helps line traces up */
2605         qemu_savevm_send_ping(ms->to_dst_file, 2);
2606     }
2607 
2608     /*
2609      * While loading the device state we may trigger page transfer
2610      * requests and the fd must be free to process those, and thus
2611      * the destination must read the whole device state off the fd before
2612      * it starts processing it.  Unfortunately the ad-hoc migration format
2613      * doesn't allow the destination to know the size to read without fully
2614      * parsing it through each device's load-state code (especially the
2615      * open-coded devices that use get/put).
2616      * So we wrap the device state up in a package with a length at the start;
2617      * to do this we use a buffer channel to hold the whole of the device state.
2618      */
2619     bioc = qio_channel_buffer_new(4096);
2620     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
2621     fb = qemu_file_new_output(QIO_CHANNEL(bioc));
2622     object_unref(OBJECT(bioc));
2623 
2624     /*
2625      * Make sure the receiver can get incoming pages before we send the rest
2626      * of the state
2627      */
2628     qemu_savevm_send_postcopy_listen(fb);
2629 
2630     qemu_savevm_state_complete_precopy(fb, false, false);
2631     if (migrate_postcopy_ram()) {
2632         qemu_savevm_send_ping(fb, 3);
2633     }
2634 
2635     qemu_savevm_send_postcopy_run(fb);
2636 
2637     /* <><> end of stuff going into the package */
2638 
2639     /* Last point of recovery; as soon as we send the package the destination
2640      * can open devices and potentially start running.
2641      * Let's just check again that we've not got any errors.
2642      */
2643     ret = qemu_file_get_error(ms->to_dst_file);
2644     if (ret) {
2645         error_setg(errp, "postcopy_start: Migration stream errored (pre package)");
2646         goto fail_closefb;
2647     }
2648 
2649     restart_block = false;
2650 
2651     /* Now send that blob */
2652     if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
2653         error_setg(errp, "%s: Failed to send packaged data", __func__);
2654         goto fail_closefb;
2655     }
2656     qemu_fclose(fb);
2657 
2658     /* Send a notification so that anything that needs to happen at the
2659      * transition to postcopy, after the device state is sent, gets a
2660      * chance to run; in particular spice needs to trigger a transition now.
2661      */
2662     migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE, NULL);
2663 
2664     migration_downtime_end(ms);
2665 
2666     bql_unlock();
2667 
2668     if (migrate_postcopy_ram()) {
2669         /*
2670          * Although this ping is just for debug, it could potentially be
2671          * used for getting a better measurement of downtime at the source.
2672          */
2673         qemu_savevm_send_ping(ms->to_dst_file, 4);
2674     }
2675 
2676     if (migrate_release_ram()) {
2677         ram_postcopy_migrated_memory_release(ms);
2678     }
2679 
2680     ret = qemu_file_get_error(ms->to_dst_file);
2681     if (ret) {
2682         error_setg_errno(errp, -ret, "postcopy_start: Migration stream error");
2683         bql_lock();
2684         goto fail;
2685     }
2686     trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
2687 
2688     return ret;
2689 
2690 fail_closefb:
2691     qemu_fclose(fb);
2692 fail:
2693     migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2694                           MIGRATION_STATUS_FAILED);
2695     if (restart_block) {
2696         /* A failure happened early enough that we know the destination hasn't
2697          * accessed block devices, so we're safe to recover.
2698          */
2699         Error *local_err = NULL;
2700 
2701         bdrv_activate_all(&local_err);
2702         if (local_err) {
2703             error_report_err(local_err);
2704         }
2705     }
2706     migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
2707     bql_unlock();
2708     return -1;
2709 }
2710 
2711 /**
2712  * migration_maybe_pause: Pause if required to by
2713  * migrate_pause_before_switchover().  Called with the BQL held.
2714  * Returns: 0 on success
2715  */
2716 static int migration_maybe_pause(MigrationState *s,
2717                                  int *current_active_state,
2718                                  int new_state)
2719 {
2720     if (!migrate_pause_before_switchover()) {
2721         return 0;
2722     }
2723 
2724     /* Since leaving this state is not atomic with posting the semaphore
2725      * it's possible that someone could have issued multiple migrate_continue
2726      * and the semaphore is incorrectly positive at this point;
2727      * the docs say it's undefined to reinit a semaphore that's already
2728      * init'd, so use timedwait to eat up any existing posts.
2729      */
2730     while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
2731         /* This block intentionally left blank */
2732     }
2733 
2734     /*
2735      * If the migration is cancelled when it is in the completion phase,
2736      * the migration state is set to MIGRATION_STATUS_CANCELLING.
2737      * In that case we must not wait on the semaphore; otherwise we
2738      * would wait forever on 'pause_sem'.
2739      */
2740     if (s->state != MIGRATION_STATUS_CANCELLING) {
2741         bql_unlock();
2742         migrate_set_state(&s->state, *current_active_state,
2743                           MIGRATION_STATUS_PRE_SWITCHOVER);
2744         qemu_sem_wait(&s->pause_sem);
2745         migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
2746                           new_state);
2747         *current_active_state = new_state;
2748         bql_lock();
2749     }
2750 
2751     return s->state == new_state ? 0 : -EINVAL;
2752 }
2753 
2754 static int migration_completion_precopy(MigrationState *s,
2755                                         int *current_active_state)
2756 {
2757     int ret;
2758 
2759     bql_lock();
2760 
2761     if (!migrate_mode_is_cpr(s)) {
2762         ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
2763         if (ret < 0) {
2764             goto out_unlock;
2765         }
2766     }
2767 
2768     ret = migration_maybe_pause(s, current_active_state,
2769                                 MIGRATION_STATUS_DEVICE);
2770     if (ret < 0) {
2771         goto out_unlock;
2772     }
2773 
2774     /*
2775      * Inactivate disks except in COLO, and track that we have done so in order
2776      * to remember to reactivate them if migration fails or is cancelled.
2777      */
2778     s->block_inactive = !migrate_colo();
2779     migration_rate_set(RATE_LIMIT_DISABLED);
2780     ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
2781                                              s->block_inactive);
2782 out_unlock:
2783     bql_unlock();
2784     return ret;
2785 }
2786 
2787 static void migration_completion_postcopy(MigrationState *s)
2788 {
2789     trace_migration_completion_postcopy_end();
2790 
2791     bql_lock();
2792     qemu_savevm_state_complete_postcopy(s->to_dst_file);
2793     bql_unlock();
2794 
2795     /*
2796      * Shut down the postcopy fast path thread.  This is only needed when
2797      * the dest QEMU binary is old (7.1/7.2).  QEMU 8.0+ doesn't need this.
2798      */
2799     if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
2800         postcopy_preempt_shutdown_file(s);
2801     }
2802 
2803     trace_migration_completion_postcopy_end_after_complete();
2804 }
2805 
2806 static void migration_completion_failed(MigrationState *s,
2807                                         int current_active_state)
2808 {
2809     if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
2810                               s->state == MIGRATION_STATUS_DEVICE)) {
2811         /*
2812          * If not doing postcopy, vm_start() will be called: let's
2813          * regain control on images.
2814          */
2815         Error *local_err = NULL;
2816 
2817         bql_lock();
2818         bdrv_activate_all(&local_err);
2819         if (local_err) {
2820             error_report_err(local_err);
2821         } else {
2822             s->block_inactive = false;
2823         }
2824         bql_unlock();
2825     }
2826 
2827     migrate_set_state(&s->state, current_active_state,
2828                       MIGRATION_STATUS_FAILED);
2829 }
2830 
2831 /**
2832  * migration_completion: Used by migration_thread when there's not much left.
2833  *   The caller 'breaks' the loop when this returns.
2834  *
2835  * @s: Current migration state
2836  */
2837 static void migration_completion(MigrationState *s)
2838 {
2839     int ret = 0;
2840     int current_active_state = s->state;
2841     Error *local_err = NULL;
2842 
2843     if (s->state == MIGRATION_STATUS_ACTIVE) {
2844         ret = migration_completion_precopy(s, &current_active_state);
2845     } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2846         migration_completion_postcopy(s);
2847     } else {
2848         ret = -1;
2849     }
2850 
2851     if (ret < 0) {
2852         goto fail;
2853     }
2854 
2855     if (close_return_path_on_source(s)) {
2856         goto fail;
2857     }
2858 
2859     if (qemu_file_get_error(s->to_dst_file)) {
2860         trace_migration_completion_file_err();
2861         goto fail;
2862     }
2863 
2864     if (migrate_colo() && s->state == MIGRATION_STATUS_ACTIVE) {
2865         /* COLO does not support postcopy */
2866         migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
2867                           MIGRATION_STATUS_COLO);
2868     } else {
2869         migration_completion_end(s);
2870     }
2871 
2872     return;
2873 
2874 fail:
2875     if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
2876         migrate_set_error(s, local_err);
2877         error_free(local_err);
2878     } else if (ret) {
2879         error_setg_errno(&local_err, -ret, "Error in migration completion");
2880         migrate_set_error(s, local_err);
2881         error_free(local_err);
2882     }
2883 
2884     migration_completion_failed(s, current_active_state);
2885 }
2886 
2887 /**
2888  * bg_migration_completion: Used by bg_migration_thread after all the
2889  *   RAM has been saved. The caller 'breaks' the loop when this returns.
2890  *
2891  * @s: Current migration state
2892  */
2893 static void bg_migration_completion(MigrationState *s)
2894 {
2895     int current_active_state = s->state;
2896 
2897     if (s->state == MIGRATION_STATUS_ACTIVE) {
2898         /*
2899          * By this moment we have RAM content saved into the migration stream.
2900          * The next step is to flush the non-RAM content (device state)
2901          * right after the ram content. The device state has been stored into
2902          * the temporary buffer before RAM saving started.
2903          */
2904         qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
2905         qemu_fflush(s->to_dst_file);
2906     } else if (s->state == MIGRATION_STATUS_CANCELLING) {
2907         goto fail;
2908     }
2909 
2910     if (qemu_file_get_error(s->to_dst_file)) {
2911         trace_migration_completion_file_err();
2912         goto fail;
2913     }
2914 
2915     migration_completion_end(s);
2916     return;
2917 
2918 fail:
2919     migrate_set_state(&s->state, current_active_state,
2920                       MIGRATION_STATUS_FAILED);
2921 }
2922 
2923 typedef enum MigThrError {
2924     /* No error detected */
2925     MIG_THR_ERR_NONE = 0,
2926     /* Detected error, but resumed successfully */
2927     MIG_THR_ERR_RECOVERED = 1,
2928     /* Detected fatal error, need to exit */
2929     MIG_THR_ERR_FATAL = 2,
2930 } MigThrError;
2931 
2932 static int postcopy_resume_handshake(MigrationState *s)
2933 {
2934     qemu_savevm_send_postcopy_resume(s->to_dst_file);
2935 
2936     while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2937         if (migration_rp_wait(s)) {
2938             return -1;
2939         }
2940     }
2941 
2942     if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2943         return 0;
2944     }
2945 
2946     return -1;
2947 }
2948 
2949 /* Return zero if success, or <0 for error */
2950 static int postcopy_do_resume(MigrationState *s)
2951 {
2952     int ret;
2953 
2954     /*
2955      * Call all the resume_prepare() hooks, so that modules can be
2956      * ready for the migration resume.
2957      */
2958     ret = qemu_savevm_state_resume_prepare(s);
2959     if (ret) {
2960         error_report("%s: resume_prepare() failure detected: %d",
2961                      __func__, ret);
2962         return ret;
2963     }
2964 
2965     /*
2966      * If preempt is enabled, re-establish the preempt channel.  Note that
2967      * we do it after resume prepare to make sure the main channel will be
2968      * created before the preempt channel.  E.g. on a flaky network, the
2969      * dest QEMU may otherwise confuse the order in which the preempt and
2970      * main channels are set up.  This guarantees the correct order.
2971      */
2972     ret = postcopy_preempt_establish_channel(s);
2973     if (ret) {
2974         error_report("%s: postcopy_preempt_establish_channel(): %d",
2975                      __func__, ret);
2976         return ret;
2977     }
2978 
2979     /*
2980      * Last handshake with destination on the resume (destination will
2981      * switch to postcopy-active afterwards)
2982      */
2983     ret = postcopy_resume_handshake(s);
2984     if (ret) {
2985         error_report("%s: handshake failed: %d", __func__, ret);
2986         return ret;
2987     }
2988 
2989     return 0;
2990 }
2991 
2992 /*
2993  * We don't return until we are in a safe state to continue the current
2994  * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
2995  * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
2996  */
2997 static MigThrError postcopy_pause(MigrationState *s)
2998 {
2999     assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
3000 
3001     while (true) {
3002         QEMUFile *file;
3003 
3004         /*
3005          * We're already pausing, so ignore any errors on the return
3006          * path and just wait for the thread to finish. It will be
3007          * re-created when we resume.
3008          */
3009         close_return_path_on_source(s);
3010 
3011         /*
3012          * Current channel is possibly broken. Release it.  Note that this is
3013          * guaranteed even without lock because to_dst_file should only be
3014          * modified by the migration thread.  That also guarantees that the
3015          * unregister of yank is safe too without the lock.  It should be safe
3016          * even to be within the qemu_file_lock, but we didn't do that to avoid
3017          * taking another mutex (yank_lock) within qemu_file_lock.  TL;DR: we make
3018          * the qemu_file_lock critical section as small as possible.
3019          */
3020         assert(s->to_dst_file);
3021         migration_ioc_unregister_yank_from_file(s->to_dst_file);
3022         qemu_mutex_lock(&s->qemu_file_lock);
3023         file = s->to_dst_file;
3024         s->to_dst_file = NULL;
3025         qemu_mutex_unlock(&s->qemu_file_lock);
3026 
3027         qemu_file_shutdown(file);
3028         qemu_fclose(file);
3029 
3030         migrate_set_state(&s->state, s->state,
3031                           MIGRATION_STATUS_POSTCOPY_PAUSED);
3032 
3033         error_report("Detected IO failure for postcopy. "
3034                      "Migration paused.");
3035 
3036         /*
3037          * We wait until things are fixed up. Then someone will set the
3038          * status back for us.
3039          */
3040         do {
3041             qemu_sem_wait(&s->postcopy_pause_sem);
3042         } while (postcopy_is_paused(s->state));
3043 
3044         if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3045             /* Woken up by a recover procedure. Give it a shot */
3046 
3047             /* Do the resume logic */
3048             if (postcopy_do_resume(s) == 0) {
3049                 /* Let's continue! */
3050                 trace_postcopy_pause_continued();
3051                 return MIG_THR_ERR_RECOVERED;
3052             } else {
3053                 /*
3054                  * Something went wrong during the recovery, let's
3055                  * pause again. Pause is always better than throwing
3056                  * data away.
3057                  */
3058                 continue;
3059             }
3060         } else {
3061             /* This is not right... Time to quit. */
3062             return MIG_THR_ERR_FATAL;
3063         }
3064     }
3065 }
3066 
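/*
 * The admin-visible recovery flow out of this pause (a sketch using the
 * QMP commands implemented above; addresses are illustrative):
 *
 *   1. Both sides sit in postcopy-paused after the IO failure (or after
 *      an explicit migrate-pause).
 *   2. Destination: { "execute": "migrate-recover",
 *                     "arguments": { "uri": "tcp::5556" } }
 *   3. Source:      { "execute": "migrate",
 *                     "arguments": { "uri": "tcp:dst-host:5556",
 *                                    "resume": true } }
 *      which re-runs migrate_prepare() with resume set; the recovery
 *      path then wakes this thread up via postcopy_pause_sem.
 */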
3067 void migration_file_set_error(int ret, Error *err)
3068 {
3069     MigrationState *s = current_migration;
3070 
3071     WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
3072         if (s->to_dst_file) {
3073             qemu_file_set_error_obj(s->to_dst_file, ret, err);
3074         } else if (err) {
3075             error_report_err(err);
3076         }
3077     }
3078 }
3079 
3080 static MigThrError migration_detect_error(MigrationState *s)
3081 {
3082     int ret;
3083     int state = s->state;
3084     Error *local_error = NULL;
3085 
3086     if (state == MIGRATION_STATUS_CANCELLING ||
3087         state == MIGRATION_STATUS_CANCELLED) {
3088         /* End the migration, but don't set the state to failed */
3089         return MIG_THR_ERR_FATAL;
3090     }
3091 
3092     /*
3093      * Try to detect any file errors.  Note that postcopy_qemufile_src will
3094      * be NULL when postcopy preempt is not enabled.
3095      */
3096     ret = qemu_file_get_error_obj_any(s->to_dst_file,
3097                                       s->postcopy_qemufile_src,
3098                                       &local_error);
3099     if (!ret) {
3100         /* Everything is fine */
3101         assert(!local_error);
3102         return MIG_THR_ERR_NONE;
3103     }
3104 
3105     if (local_error) {
3106         migrate_set_error(s, local_error);
3107         error_free(local_error);
3108     }
3109 
3110     if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
3111         /*
3112          * For postcopy, we allow the network to be down for a
3113          * while. After that, it can be continued by a
3114          * recovery phase.
3115          */
3116         return postcopy_pause(s);
3117     } else {
3118         /*
3119          * For precopy (or postcopy with an error outside IO), we fail
3120          * immediately.
3121          */
3122         migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
3123         trace_migration_thread_file_err();
3124 
3125         /* Time to stop the migration, now. */
3126         return MIG_THR_ERR_FATAL;
3127     }
3128 }
3129 
3130 static void migration_completion_end(MigrationState *s)
3131 {
3132     uint64_t bytes = migration_transferred_bytes();
3133     int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3134     int64_t transfer_time;
3135 
3136     /*
3137      * Take the BQL here so that query-migrate on the QMP thread sees:
3138      * - atomic update of s->total_time and s->mbps;
3139      * - correct ordering of s->mbps update vs. s->state;
3140      */
3141     bql_lock();
3142     migration_downtime_end(s);
3143     s->total_time = end_time - s->start_time;
3144     transfer_time = s->total_time - s->setup_time;
3145     if (transfer_time) {
3146         s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
3147     }
3148 
3149     migrate_set_state(&s->state, s->state,
3150                       MIGRATION_STATUS_COMPLETED);
3151     bql_unlock();
3152 }
3153 
3154 static void update_iteration_initial_status(MigrationState *s)
3155 {
3156     /*
3157      * Update these three fields at the same time to avoid mismatched info
3158      * leading to wrong speed calculations.
3159      */
3160     s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3161     s->iteration_initial_bytes = migration_transferred_bytes();
3162     s->iteration_initial_pages = ram_get_total_transferred_pages();
3163 }
3164 
3165 static void migration_update_counters(MigrationState *s,
3166                                       int64_t current_time)
3167 {
3168     uint64_t transferred, transferred_pages, time_spent;
3169     uint64_t current_bytes; /* bytes transferred since the beginning */
3170     uint64_t switchover_bw;
3171     /* Expected bandwidth when switching over to destination QEMU */
3172     double expected_bw_per_ms;
3173     double bandwidth;
3174 
3175     if (current_time < s->iteration_start_time + BUFFER_DELAY) {
3176         return;
3177     }
3178 
3179     switchover_bw = migrate_avail_switchover_bandwidth();
3180     current_bytes = migration_transferred_bytes();
3181     transferred = current_bytes - s->iteration_initial_bytes;
3182     time_spent = current_time - s->iteration_start_time;
3183     bandwidth = (double)transferred / time_spent;
3184 
3185     if (switchover_bw) {
3186         /*
3187          * If the user specified a switchover bandwidth, trust the user:
3188          * it can be more accurate than what we estimated.
3189          */
3190         expected_bw_per_ms = switchover_bw / 1000;
3191     } else {
3192         /* If the user doesn't specify a bandwidth, use the estimate */
3193         expected_bw_per_ms = bandwidth;
3194     }
3195 
3196     s->threshold_size = expected_bw_per_ms * migrate_downtime_limit();
3197 
3198     s->mbps = (((double) transferred * 8.0) /
3199                ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
3200 
3201     transferred_pages = ram_get_total_transferred_pages() -
3202                             s->iteration_initial_pages;
3203     s->pages_per_second = (double) transferred_pages /
3204                              (((double) time_spent / 1000.0));
3205 
3206     /*
3207      * If we haven't sent anything, we don't want to
3208      * recalculate. 10000 is a small enough number for our purposes.
3209      */
3210     if (stat64_get(&mig_stats.dirty_pages_rate) &&
3211         transferred > 10000) {
3212         s->expected_downtime =
3213             stat64_get(&mig_stats.dirty_bytes_last_sync) / expected_bw_per_ms;
3214     }
3215 
3216     migration_rate_reset();
3217 
3218     update_iteration_initial_status(s);
3219 
3220     trace_migrate_transferred(transferred, time_spent,
3221                               /* Both in unit bytes/ms */
3222                               bandwidth, switchover_bw / 1000,
3223                               s->threshold_size);
3224 }
3225 
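/*
 * A worked example of the threshold computation above, with
 * illustrative numbers: at an estimated bandwidth of ~1 GiB/s
 * (~1 MiB per ms) and a downtime limit of 300 ms, threshold_size ends
 * up around 300 MiB.  Iteration continues until the remaining dirty
 * data fits within that budget (see migration_iteration_run() below).
 */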
3226 static bool migration_can_switchover(MigrationState *s)
3227 {
3228     if (!migrate_switchover_ack()) {
3229         return true;
3230     }
3231 
3232     /* No reason to wait for switchover ACK if VM is stopped */
3233     if (!runstate_is_running()) {
3234         return true;
3235     }
3236 
3237     return s->switchover_acked;
3238 }
3239 
3240 /* Migration thread iteration status */
3241 typedef enum {
3242     MIG_ITERATE_RESUME,         /* Resume current iteration */
3243     MIG_ITERATE_SKIP,           /* Skip current iteration */
3244     MIG_ITERATE_BREAK,          /* Break the loop */
3245 } MigIterateState;
3246 
3247 /*
3248  * Run one migration iteration and return the iteration status:
3249  * resume, skip, or break the migration loop.
3250  */
3251 static MigIterateState migration_iteration_run(MigrationState *s)
3252 {
3253     uint64_t must_precopy, can_postcopy, pending_size;
3254     Error *local_err = NULL;
3255     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
3256     bool can_switchover = migration_can_switchover(s);
3257 
3258     qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
3259     pending_size = must_precopy + can_postcopy;
3260     trace_migrate_pending_estimate(pending_size, must_precopy, can_postcopy);
3261 
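         /*
          * The estimate above is cheap; only fall back to the exact (and
          * typically more expensive) accounting when the estimate suggests
          * we may be close enough to switch over.
          */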
3262     if (pending_size < s->threshold_size) {
3263         qemu_savevm_state_pending_exact(&must_precopy, &can_postcopy);
3264         pending_size = must_precopy + can_postcopy;
3265         trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
3266     }
3267 
3268     if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
3269         trace_migration_thread_low_pending(pending_size);
3270         migration_completion(s);
3271         return MIG_ITERATE_BREAK;
3272     }
3273 
3274     /* Still a significant amount to transfer */
3275     if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover &&
3276         qatomic_read(&s->start_postcopy)) {
3277         if (postcopy_start(s, &local_err)) {
3278             migrate_set_error(s, local_err);
3279             error_report_err(local_err);
3280         }
3281         return MIG_ITERATE_SKIP;
3282     }
3283 
3284     /* Just another iteration step */
3285     qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
3286     return MIG_ITERATE_RESUME;
3287 }
3288 
3289 static void migration_iteration_finish(MigrationState *s)
3290 {
3291     /* If we enabled cpu throttling for auto-converge, turn it off. */
3292     cpu_throttle_stop();
3293 
3294     bql_lock();
3295     switch (s->state) {
3296     case MIGRATION_STATUS_COMPLETED:
3297         runstate_set(RUN_STATE_POSTMIGRATE);
3298         break;
3299     case MIGRATION_STATUS_COLO:
3300         assert(migrate_colo());
3301         migrate_start_colo_process(s);
3302         s->vm_old_state = RUN_STATE_RUNNING;
3303         /* Fallthrough */
3304     case MIGRATION_STATUS_FAILED:
3305     case MIGRATION_STATUS_CANCELLED:
3306     case MIGRATION_STATUS_CANCELLING:
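             /*
              * If the VM was live before migration started, restart it,
              * unless the guest has shut down meanwhile; otherwise restore
              * the pre-migration run state if we stopped the VM ourselves.
              */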
3307         if (runstate_is_live(s->vm_old_state)) {
3308             if (!runstate_check(RUN_STATE_SHUTDOWN)) {
3309                 vm_start();
3310             }
3311         } else {
3312             if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
3313                 runstate_set(s->vm_old_state);
3314             }
3315         }
3316         break;
3317 
3318     default:
3319         /* Should not reach here, but if so, forgive the VM. */
3320         error_report("%s: Unknown ending state %d", __func__, s->state);
3321         break;
3322     }
3323 
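         /*
          * Schedule cleanup via a bottom half in the main thread: it joins
          * the migration thread, which cannot join itself.
          */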
3324     migration_bh_schedule(migrate_fd_cleanup_bh, s);
3325     bql_unlock();
3326 }
3327 
3328 static void bg_migration_iteration_finish(MigrationState *s)
3329 {
3330     /*
3331      * Stop tracking RAM writes - un-protect memory, un-register UFFD
3332      * memory ranges, flush kernel wait queues and wake up threads
3333      * waiting for write faults to be resolved.
3334      */
3335     ram_write_tracking_stop();
3336 
3337     bql_lock();
3338     switch (s->state) {
3339     case MIGRATION_STATUS_COMPLETED:
3340     case MIGRATION_STATUS_ACTIVE:
3341     case MIGRATION_STATUS_FAILED:
3342     case MIGRATION_STATUS_CANCELLED:
3343     case MIGRATION_STATUS_CANCELLING:
3344         break;
3345 
3346     default:
3347         /* Should not reach here, but if so, forgive the VM. */
3348         error_report("%s: Unknown ending state %d", __func__, s->state);
3349         break;
3350     }
3351 
3352     migration_bh_schedule(migrate_fd_cleanup_bh, s);
3353     bql_unlock();
3354 }
3355 
3356 /*
3357  * Run a single background-snapshot iteration and decide whether the
3358  * caller should resume iterating or break out of the loop.
3359  */
3360 static MigIterateState bg_migration_iteration_run(MigrationState *s)
3361 {
3362     int res;
3363 
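         /* qemu_savevm_state_iterate() returns > 0 once all iterable state has been sent */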
3364     res = qemu_savevm_state_iterate(s->to_dst_file, false);
3365     if (res > 0) {
3366         bg_migration_completion(s);
3367         return MIG_ITERATE_BREAK;
3368     }
3369 
3370     return MIG_ITERATE_RESUME;
3371 }
3372 
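     /*
      * Urgent-request protocol: posting rate_limit_sem wakes
      * migration_rate_limit() out of its throttling wait early; every post
      * made here is matched by a wait in migration_consume_urgent_request().
      */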
3373 void migration_make_urgent_request(void)
3374 {
3375     qemu_sem_post(&migrate_get_current()->rate_limit_sem);
3376 }
3377 
3378 void migration_consume_urgent_request(void)
3379 {
3380     qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
3381 }
3382 
3383 /* Returns true if the rate limiting was broken by an urgent request */
3384 bool migration_rate_limit(void)
3385 {
3386     int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3387     MigrationState *s = migrate_get_current();
3388 
3389     bool urgent = false;
3390     migration_update_counters(s, now);
3391     if (migration_rate_exceeded(s->to_dst_file)) {
3392 
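             /* If the channel already hit an error, don't bother waiting. */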
3393         if (qemu_file_get_error(s->to_dst_file)) {
3394             return false;
3395         }
3396         /*
3397          * Wait either for the rate-limiting delay to elapse OR for
3398          * something urgent to post the semaphore.
3399          */
3400         int ms = s->iteration_start_time + BUFFER_DELAY - now;
3401         trace_migration_rate_limit_pre(ms);
3402         if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
3403             /*
3404              * We were woken by one or more urgent requests, but the
3405              * timedwait has consumed one of them. The service routine
3406              * for the urgent wake decrements the semaphore itself for
3407              * each item it consumes, so repost the one the timedwait
3408              * just ate.
3409              */
3410             qemu_sem_post(&s->rate_limit_sem);
3411             urgent = true;
3412         }
3413         trace_migration_rate_limit_post(urgent);
3414     }
3415     return urgent;
3416 }
3417 
3418 /*
3419  * If failover devices are present, wait until they are completely
3420  * unplugged.
3421  */
3422 
3423 static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
3424                                     int new_state)
3425 {
3426     if (qemu_savevm_state_guest_unplug_pending()) {
3427         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
3428 
3429         while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
3430                qemu_savevm_state_guest_unplug_pending()) {
3431             qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3432         }
3433         if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
3434             int timeout = 120; /* 120 * 250ms == 30 seconds */
3435             /*
3436              * Migration has been cancelled, but since we have started
3437              * an unplug we must wait for it to finish so the card can
3438              * be plugged back in.
3439              */
3440             while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
3441                 qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3442             }
3443             if (qemu_savevm_state_guest_unplug_pending() &&
3444                 !qtest_enabled()) {
3445                 warn_report("migration: partially unplugged device on "
3446                             "failure");
3447             }
3448         }
3449 
3450         migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
3451     } else {
3452         migrate_set_state(&s->state, old_state, new_state);
3453     }
3454 }
3455 
3456 /*
3457  * Master migration thread on the source VM.
3458  * It drives the migration and pumps the data down the outgoing channel.
3459  */
3460 static void *migration_thread(void *opaque)
3461 {
3462     MigrationState *s = opaque;
3463     MigrationThread *thread = NULL;
3464     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3465     MigThrError thr_error;
3466     bool urgent = false;
3467     Error *local_err = NULL;
3468     int ret;
3469 
3470     thread = migration_threads_add("live_migration", qemu_get_thread_id());
3471 
3472     rcu_register_thread();
3473 
3474     object_ref(OBJECT(s));
3475     update_iteration_initial_status(s);
3476 
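         /*
          * Bring up the multifd send channels (when multifd is enabled)
          * before any migration data is sent.
          */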
3477     if (!multifd_send_setup()) {
3478         goto out;
3479     }
3480 
3481     bql_lock();
3482     qemu_savevm_state_header(s->to_dst_file);
3483     bql_unlock();
3484 
3485     /*
3486      * If we opened the return path, we need to make sure dst has it
3487      * opened as well.
3488      */
3489     if (s->rp_state.rp_thread_created) {
3490         /* Now tell the dest that it should open its end so it can reply */
3491         qemu_savevm_send_open_return_path(s->to_dst_file);
3492 
3493         /* And do a ping that will make stuff easier to debug */
3494         qemu_savevm_send_ping(s->to_dst_file, 1);
3495     }
3496 
3497     if (migrate_postcopy()) {
3498         /*
3499          * Tell the destination that we *might* want to do postcopy later;
3500          * if the other end can't do postcopy it should fail now, nice and
3501          * early.
3502          */
3503         qemu_savevm_send_postcopy_advise(s->to_dst_file);
3504     }
3505 
3506     if (migrate_colo()) {
3507         /* Notify migration destination that we enable COLO */
3508         qemu_savevm_send_colo_enable(s->to_dst_file);
3509     }
3510 
3511     bql_lock();
3512     ret = qemu_savevm_state_setup(s->to_dst_file, &local_err);
3513     bql_unlock();
3514 
3515     qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3516                                MIGRATION_STATUS_ACTIVE);
3517 
3518     /*
3519      * Handle SETUP failures after waiting for virtio-net-failover
3520      * devices to unplug. This is to preserve migration state transitions.
3521      */
3522     if (ret) {
3523         migrate_set_error(s, local_err);
3524         error_free(local_err);
3525         migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
3526                           MIGRATION_STATUS_FAILED);
3527         goto out;
3528     }
3529 
3530     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3531 
3532     trace_migration_thread_setup_complete();
3533 
3534     while (migration_is_active()) {
3535         if (urgent || !migration_rate_exceeded(s->to_dst_file)) {
3536             MigIterateState iter_state = migration_iteration_run(s);
3537             if (iter_state == MIG_ITERATE_SKIP) {
3538                 continue;
3539             } else if (iter_state == MIG_ITERATE_BREAK) {
3540                 break;
3541             }
3542         }
3543 
3544         /*
3545          * Try to detect any kind of failures, and see whether we
3546          * should stop the migration now.
3547          */
3548         thr_error = migration_detect_error(s);
3549         if (thr_error == MIG_THR_ERR_FATAL) {
3550             /* Stop migration */
3551             break;
3552         } else if (thr_error == MIG_THR_ERR_RECOVERED) {
3553             /*
3554              * Just recovered from, e.g., a network failure; reset all
3555              * the local counters. This is important to avoid breaking
3556              * the transferred_bytes and bandwidth calculations.
3557              */
3558             update_iteration_initial_status(s);
3559         }
3560 
3561         urgent = migration_rate_limit();
3562     }
3563 
3564 out:
3565     trace_migration_thread_after_loop();
3566     migration_iteration_finish(s);
3567     object_unref(OBJECT(s));
3568     rcu_unregister_thread();
3569     migration_threads_remove(thread);
3570     return NULL;
3571 }
3572 
3573 static void bg_migration_vm_start_bh(void *opaque)
3574 {
3575     MigrationState *s = opaque;
3576 
3577     vm_resume(s->vm_old_state);
3578     migration_downtime_end(s);
3579 }
3580 
3581 /**
3582  * Background snapshot thread, based on live migration code.
3583  * This is an alternative implementation of the live migration mechanism,
3584  * introduced specifically to support background snapshots.
3585  *
3586  * It takes advantage of the userfault_fd write-protection mechanism
3587  * introduced in the v5.7 kernel. Compared to the existing dirty page
3588  * logging migration, it produces much less stream traffic, resulting in
3589  * smaller snapshot images, simply because no duplicate pages can get
3590  * into the stream.
3591  *
3592  * Another key point: the generated vmstate stream reflects the machine
3593  * state 'frozen' at the start of snapshot creation, whereas with dirty
3594  * page logging the saved snapshot reflects the VM state at the end.
3595  */
3596 static void *bg_migration_thread(void *opaque)
3597 {
3598     MigrationState *s = opaque;
3599     int64_t setup_start;
3600     MigThrError thr_error;
3601     QEMUFile *fb;
3602     bool early_fail = true;
3603     Error *local_err = NULL;
3604     int ret;
3605 
3606     rcu_register_thread();
3607     object_ref(OBJECT(s));
3608 
3609     migration_rate_set(RATE_LIMIT_DISABLED);
3610 
3611     setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3612     /*
3613      * We want to save vmstate for the moment when migration has been
3614      * initiated but also we want to save RAM content while VM is running.
3615      * The RAM content should appear first in the vmstate. So, we first
3616      * stash the non-RAM part of the vmstate to the temporary buffer,
3617      * then write RAM part of the vmstate to the migration stream
3618      * with vCPUs running and, finally, write stashed non-RAM part of
3619      * the vmstate from the buffer to the migration stream.
3620      */
3621     s->bioc = qio_channel_buffer_new(512 * 1024);
3622     qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
3623     fb = qemu_file_new_output(QIO_CHANNEL(s->bioc));
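         /* fb now holds its own reference to the buffer channel; drop ours */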
3624     object_unref(OBJECT(s->bioc));
3625 
3626     update_iteration_initial_status(s);
3627 
3628     /*
3629      * Prepare for tracking memory writes with UFFD-WP - populate
3630      * RAM pages before protecting.
3631      */
3632 #ifdef __linux__
3633     ram_write_tracking_prepare();
3634 #endif
3635 
3636     bql_lock();
3637     qemu_savevm_state_header(s->to_dst_file);
3638     ret = qemu_savevm_state_setup(s->to_dst_file, &local_err);
3639     bql_unlock();
3640 
3641     qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3642                                MIGRATION_STATUS_ACTIVE);
3643 
3644     /*
3645      * Handle SETUP failures after waiting for virtio-net-failover
3646      * devices to unplug. This is to preserve migration state transitions.
3647      */
3648     if (ret) {
3649         migrate_set_error(s, local_err);
3650         error_free(local_err);
3651         migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
3652                           MIGRATION_STATUS_FAILED);
3653         goto fail_setup;
3654     }
3655 
3656     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3657 
3658     trace_migration_thread_setup_complete();
3659 
3660     bql_lock();
3661 
3662     if (migration_stop_vm(s, RUN_STATE_PAUSED)) {
3663         goto fail;
3664     }
3665     /*
3666      * Put vCPUs in sync with shadow context structures, then
3667      * save their state to the channel buffer along with the devices.
3668      */
3669     cpu_synchronize_all_states();
3670     if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
3671         goto fail;
3672     }
3673     /*
3674      * Since we are going to get non-iterable state data directly
3675      * from s->bioc->data, explicit flush is needed here.
3676      */
3677     qemu_fflush(fb);
3678 
3679     /* Now initialize UFFD context and start tracking RAM writes */
3680     if (ram_write_tracking_start()) {
3681         goto fail;
3682     }
3683     early_fail = false;
3684 
3685     /*
3686      * Start the VM from a BH handler to avoid triggering write faults here:
3687      * UFFD-WP protection for the whole of RAM is already enabled, so
3688      * calling the VM state change notifiers from vm_start() would initiate
3689      * writes to the virtio VQs' memory, which lies in a write-protected region.
3690      */
3691     migration_bh_schedule(bg_migration_vm_start_bh, s);
3692     bql_unlock();
3693 
3694     while (migration_is_active()) {
3695         MigIterateState iter_state = bg_migration_iteration_run(s);
3696         if (iter_state == MIG_ITERATE_SKIP) {
3697             continue;
3698         } else if (iter_state == MIG_ITERATE_BREAK) {
3699             break;
3700         }
3701 
3702         /*
3703          * Try to detect any kind of failures, and see whether we
3704          * should stop the migration now.
3705          */
3706         thr_error = migration_detect_error(s);
3707         if (thr_error == MIG_THR_ERR_FATAL) {
3708             /* Stop migration */
3709             break;
3710         }
3711 
3712         migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
3713     }
3714 
3715     trace_migration_thread_after_loop();
3716 
3717 fail:
3718     if (early_fail) {
3719         migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
3720                 MIGRATION_STATUS_FAILED);
3721         bql_unlock();
3722     }
3723 
3724 fail_setup:
3725     bg_migration_iteration_finish(s);
3726 
3727     qemu_fclose(fb);
3728     object_unref(OBJECT(s));
3729     rcu_unregister_thread();
3730 
3731     return NULL;
3732 }
3733 
3734 void migrate_fd_connect(MigrationState *s, Error *error_in)
3735 {
3736     Error *local_err = NULL;
3737     uint64_t rate_limit;
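         /* Connecting while in RECOVER_SETUP means resuming a paused postcopy */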
3738     bool resume = (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP);
3739     int ret;
3740 
3741     /*
3742      * If there's a previous error, free it and prepare for another one.
3743      * Meanwhile, if migration completes successfully, no stale error
3744      * will be dumped when calling migrate_fd_cleanup().
3745      */
3746     migrate_error_free(s);
3747 
3748     s->expected_downtime = migrate_downtime_limit();
3749     if (error_in) {
3750         migrate_fd_error(s, error_in);
3751         if (resume) {
3752             /*
3753              * Don't do cleanup for a resume if the channel is invalid;
3754              * only dump the error and wait for another channel connect
3755              * from the user. The error_report still gives the HMP user a
3756              * hint on what failed. It's normally done in
3757              * migrate_fd_cleanup(), but call it here explicitly.
3758              */
3759             error_report_err(error_copy(s->error));
3760         } else {
3761             migrate_fd_cleanup(s);
3762         }
3763         return;
3764     }
3765 
3766     if (resume) {
3767         /* This is a resumed migration */
3768         rate_limit = migrate_max_postcopy_bandwidth();
3769     } else {
3770         /* This is a fresh new migration */
3771         rate_limit = migrate_max_bandwidth();
3772 
3773         /* Notify before starting migration thread */
3774         if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP, &local_err)) {
3775             goto fail;
3776         }
3777     }
3778 
3779     migration_rate_set(rate_limit);
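         /* The migration thread does synchronous I/O; make the channel blocking */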
3780     qemu_file_set_blocking(s->to_dst_file, true);
3781 
3782     /*
3783      * Open the return path. For postcopy it is always used; for precopy
3784      * QEMU uses the return path only if the user enabled the
3785      * "return-path" capability.
3786      */
3787     if (migrate_postcopy_ram() || migrate_return_path()) {
3788         if (open_return_path_on_source(s)) {
3789             error_setg(&local_err, "Unable to open return-path for postcopy");
3790             goto fail;
3791         }
3792     }
3793 
3794     /*
3795      * This needs to be done before resuming a postcopy.  Note: for newer
3796      * QEMUs we will delay the channel creation until postcopy_start(), to
3797      * avoid out-of-order channel creation.
3798      */
3799     if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
3800         postcopy_preempt_setup(s);
3801     }
3802 
3803     if (resume) {
3804         /* Wakeup the main migration thread to do the recovery */
3805         migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP,
3806                           MIGRATION_STATUS_POSTCOPY_RECOVER);
3807         qemu_sem_post(&s->postcopy_pause_sem);
3808         return;
3809     }
3810 
3811     if (migrate_mode_is_cpr(s)) {
3812         ret = migration_stop_vm(s, RUN_STATE_FINISH_MIGRATE);
3813         if (ret < 0) {
3814             error_setg(&local_err, "migration_stop_vm failed, error %d", -ret);
3815             goto fail;
3816         }
3817     }
3818 
3819     if (migrate_background_snapshot()) {
3820         qemu_thread_create(&s->thread, "mig/snapshot",
3821                 bg_migration_thread, s, QEMU_THREAD_JOINABLE);
3822     } else {
3823         qemu_thread_create(&s->thread, "mig/src/main",
3824                 migration_thread, s, QEMU_THREAD_JOINABLE);
3825     }
3826     s->migration_thread_running = true;
3827     return;
3828 
3829 fail:
3830     migrate_set_error(s, local_err);
3831     migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
3832     error_report_err(local_err);
3833     migrate_fd_cleanup(s);
3834 }
3835 
3836 static void migration_class_init(ObjectClass *klass, void *data)
3837 {
3838     DeviceClass *dc = DEVICE_CLASS(klass);
3839 
3840     dc->user_creatable = false;
3841     device_class_set_props(dc, migration_properties);
3842 }
3843 
3844 static void migration_instance_finalize(Object *obj)
3845 {
3846     MigrationState *ms = MIGRATION_OBJ(obj);
3847 
3848     qemu_mutex_destroy(&ms->error_mutex);
3849     qemu_mutex_destroy(&ms->qemu_file_lock);
3850     qemu_sem_destroy(&ms->wait_unplug_sem);
3851     qemu_sem_destroy(&ms->rate_limit_sem);
3852     qemu_sem_destroy(&ms->pause_sem);
3853     qemu_sem_destroy(&ms->postcopy_pause_sem);
3854     qemu_sem_destroy(&ms->rp_state.rp_sem);
3855     qemu_sem_destroy(&ms->rp_state.rp_pong_acks);
3856     qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
3857     error_free(ms->error);
3858 }
3859 
3860 static void migration_instance_init(Object *obj)
3861 {
3862     MigrationState *ms = MIGRATION_OBJ(obj);
3863 
3864     ms->state = MIGRATION_STATUS_NONE;
3865     ms->mbps = -1;
3866     ms->pages_per_second = -1;
3867     qemu_sem_init(&ms->pause_sem, 0);
3868     qemu_mutex_init(&ms->error_mutex);
3869 
3870     migrate_params_init(&ms->parameters);
3871 
3872     qemu_sem_init(&ms->postcopy_pause_sem, 0);
3873     qemu_sem_init(&ms->rp_state.rp_sem, 0);
3874     qemu_sem_init(&ms->rp_state.rp_pong_acks, 0);
3875     qemu_sem_init(&ms->rate_limit_sem, 0);
3876     qemu_sem_init(&ms->wait_unplug_sem, 0);
3877     qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
3878     qemu_mutex_init(&ms->qemu_file_lock);
3879 }
3880 
3881 /*
3882  * Return true if the check passes, false otherwise. The error will be
3883  * stored in errp if provided.
3884  */
3885 static bool migration_object_check(MigrationState *ms, Error **errp)
3886 {
3887     /* Baseline for the check: assume all capabilities were previously off */
3888     bool old_caps[MIGRATION_CAPABILITY__MAX] = { 0 };
3889 
3890     if (!migrate_params_check(&ms->parameters, errp)) {
3891         return false;
3892     }
3893 
3894     return migrate_caps_check(old_caps, ms->capabilities, errp);
3895 }
3896 
3897 static const TypeInfo migration_type = {
3898     .name = TYPE_MIGRATION,
3899     /*
3900      * NOTE: TYPE_MIGRATION is not really a device, as the object is
3901      * not created using qdev_new(), it is not attached to the qdev
3902      * device tree, and it is never realized.
3903      *
3904      * TODO: Make this TYPE_OBJECT once QOM provides something like
3905      * TYPE_DEVICE's "-global" properties.
3906      */
3907     .parent = TYPE_DEVICE,
3908     .class_init = migration_class_init,
3909     .class_size = sizeof(MigrationClass),
3910     .instance_size = sizeof(MigrationState),
3911     .instance_init = migration_instance_init,
3912     .instance_finalize = migration_instance_finalize,
3913 };
3914 
3915 static void register_migration_types(void)
3916 {
3917     type_register_static(&migration_type);
3918 }
3919 
3920 type_init(register_migration_types);
3921