xref: /openbmc/qemu/migration/migration.c (revision 1abaec9a)
1 /*
2  * QEMU live migration
3  *
4  * Copyright IBM, Corp. 2008
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15 
16 #include "qemu/osdep.h"
17 #include "qemu/cutils.h"
18 #include "qemu/error-report.h"
19 #include "qemu/main-loop.h"
20 #include "migration/blocker.h"
21 #include "exec.h"
22 #include "fd.h"
23 #include "socket.h"
24 #include "sysemu/runstate.h"
25 #include "sysemu/sysemu.h"
26 #include "sysemu/cpu-throttle.h"
27 #include "rdma.h"
28 #include "ram.h"
29 #include "migration/global_state.h"
30 #include "migration/misc.h"
31 #include "migration.h"
32 #include "savevm.h"
33 #include "qemu-file-channel.h"
34 #include "qemu-file.h"
35 #include "migration/vmstate.h"
36 #include "block/block.h"
37 #include "qapi/error.h"
38 #include "qapi/clone-visitor.h"
39 #include "qapi/qapi-visit-migration.h"
40 #include "qapi/qapi-visit-sockets.h"
41 #include "qapi/qapi-commands-migration.h"
42 #include "qapi/qapi-events-migration.h"
43 #include "qapi/qmp/qerror.h"
44 #include "qapi/qmp/qnull.h"
45 #include "qemu/rcu.h"
46 #include "block.h"
47 #include "postcopy-ram.h"
48 #include "qemu/thread.h"
49 #include "trace.h"
50 #include "exec/target_page.h"
51 #include "io/channel-buffer.h"
52 #include "migration/colo.h"
53 #include "hw/boards.h"
54 #include "hw/qdev-properties.h"
55 #include "hw/qdev-properties-system.h"
56 #include "monitor/monitor.h"
57 #include "net/announce.h"
58 #include "qemu/queue.h"
59 #include "multifd.h"
60 #include "qemu/yank.h"
61 #include "sysemu/cpus.h"
62 #include "yank_functions.h"
63 #include "sysemu/qtest.h"
64 
/* Migration transfer speed throttling */
#define MAX_THROTTLE (128 << 20)

/*
 * Amount of time to allocate to each "chunk" of bandwidth-throttled data,
 * and the number of such chunks per 1000 time units (presumably
 * milliseconds per second -- confirm against the users of these macros).
 */
#define BUFFER_DELAY 100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)

/*
 * Time in milliseconds we are allowed to stop the source,
 * for sending the last part.
 */
#define DEFAULT_MIGRATE_SET_DOWNTIME 300

/* Maximum migrate downtime: 2000 seconds, also expressed in milliseconds */
#define MAX_MIGRATE_DOWNTIME_SECONDS 2000
#define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000)

/* Default compression thread count */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/*
 * Default decompression thread count; usually decompression is at
 * least 4 times as fast as compression.
 */
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* 0: no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1

/* Default auto-converge CPU throttle migration parameters */
#define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10
#define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99

/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)

/* The delay time (in ms) between two COLO checkpoints */
#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)

/* Multifd defaults */
#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
#define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
/* 0: no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
/* 0: no compression, 1: best speed, ... 20: best compression ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1

/*
 * Background transfer rate for postcopy; 0 means unlimited.  Note that
 * page requests can still exceed this limit.
 */
#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0

/*
 * Parameters for self_announce_delay giving a stream of RARP/ARP
 * packets after migration.
 */
#define DEFAULT_MIGRATE_ANNOUNCE_INITIAL 50
#define DEFAULT_MIGRATE_ANNOUNCE_MAX 550
#define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS 5
#define DEFAULT_MIGRATE_ANNOUNCE_STEP 100
118 
119 static NotifierList migration_state_notifiers =
120     NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
121 
/*
 * Messages sent on the return path from destination to source.
 * The numeric values are spelled out because they are written to the
 * stream as the message type (see migrate_send_rp_message()).
 */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID      = 0, /* Must be 0 */
    MIG_RP_MSG_SHUT         = 1, /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG         = 2, /* Response to a PING; data (seq: be32 ) */

    MIG_RP_MSG_REQ_PAGES_ID = 3, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES    = 4, /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP  = 5, /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK   = 6, /* tell source that we are ready to resume */

    MIG_RP_MSG_MAX          = 7
};
135 
136 /* Migration capabilities set */
137 struct MigrateCapsSet {
138     int size;                       /* Capability set size */
139     MigrationCapability caps[];     /* Variadic array of capabilities */
140 };
141 typedef struct MigrateCapsSet MigrateCapsSet;
142 
/*
 * Define a MigrateCapsSet called _name and initialize it from the listed
 * capabilities.  .size is derived from the argument count by measuring an
 * int array built from the same arguments.
 */
#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...)                     \
    MigrateCapsSet _name = {                                        \
        .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int),     \
        .caps = { __VA_ARGS__ },                                    \
    }
149 
/*
 * Capabilities on the background-snapshot compatibility check list.
 * The order of the entries is kept as is.
 */
static const INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
    MIGRATION_CAPABILITY_POSTCOPY_RAM,
    MIGRATION_CAPABILITY_DIRTY_BITMAPS,
    MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME,
    MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE,
    MIGRATION_CAPABILITY_RETURN_PATH,
    MIGRATION_CAPABILITY_MULTIFD,
    MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER,
    MIGRATION_CAPABILITY_AUTO_CONVERGE,
    MIGRATION_CAPABILITY_RELEASE_RAM,
    MIGRATION_CAPABILITY_RDMA_PIN_ALL,
    MIGRATION_CAPABILITY_COMPRESS,
    MIGRATION_CAPABILITY_XBZRLE,
    MIGRATION_CAPABILITY_X_COLO,
    MIGRATION_CAPABILITY_VALIDATE_UUID,
    MIGRATION_CAPABILITY_ZERO_COPY_SEND
);
168 
169 /* When we add fault tolerance, we could have several
170    migrations at once.  For now we don't need to add
171    dynamic creation of migration */
172 
173 static MigrationState *current_migration;
174 static MigrationIncomingState *current_incoming;
175 
176 static GSList *migration_blockers;
177 
178 static bool migration_object_check(MigrationState *ms, Error **errp);
179 static int migration_maybe_pause(MigrationState *s,
180                                  int *current_active_state,
181                                  int new_state);
182 static void migrate_fd_cancel(MigrationState *s);
183 
/* Whether the transport in use permits multiple channels; true by default */
static bool migrate_allow_multi_channels = true;

/* Record whether the current migration protocol allows multiple channels. */
void migrate_protocol_allow_multi_channels(bool allow)
{
    migrate_allow_multi_channels = allow;
}
190 
191 bool migrate_multi_channels_is_allowed(void)
192 {
193     return migrate_allow_multi_channels;
194 }
195 
196 static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
197 {
198     uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
199 
200     return (a > b) - (a < b);
201 }
202 
203 void migration_object_init(void)
204 {
205     /* This can only be called once. */
206     assert(!current_migration);
207     current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
208 
209     /*
210      * Init the migrate incoming object as well no matter whether
211      * we'll use it or not.
212      */
213     assert(!current_incoming);
214     current_incoming = g_new0(MigrationIncomingState, 1);
215     current_incoming->state = MIGRATION_STATUS_NONE;
216     current_incoming->postcopy_remote_fds =
217         g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
218     qemu_mutex_init(&current_incoming->rp_mutex);
219     qemu_event_init(&current_incoming->main_thread_load_event, false);
220     qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
221     qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
222     qemu_mutex_init(&current_incoming->page_request_mutex);
223     current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
224 
225     migration_object_check(current_migration, &error_fatal);
226 
227     blk_mig_init();
228     ram_mig_init();
229     dirty_bitmap_mig_init();
230 }
231 
232 void migration_cancel(const Error *error)
233 {
234     if (error) {
235         migrate_set_error(current_migration, error);
236     }
237     migrate_fd_cancel(current_migration);
238 }
239 
240 void migration_shutdown(void)
241 {
242     /*
243      * When the QEMU main thread exit, the COLO thread
244      * may wait a semaphore. So, we should wakeup the
245      * COLO thread before migration shutdown.
246      */
247     colo_shutdown();
248     /*
249      * Cancel the current migration - that will (eventually)
250      * stop the migration using this structure
251      */
252     migration_cancel(NULL);
253     object_unref(OBJECT(current_migration));
254 
255     /*
256      * Cancel outgoing migration of dirty bitmaps. It should
257      * at least unref used block nodes.
258      */
259     dirty_bitmap_mig_cancel_outgoing();
260 
261     /*
262      * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
263      * are non-critical data, and their loss never considered as
264      * something serious.
265      */
266     dirty_bitmap_mig_cancel_incoming();
267 }
268 
269 /* For outgoing */
270 MigrationState *migrate_get_current(void)
271 {
272     /* This can only be called after the object created. */
273     assert(current_migration);
274     return current_migration;
275 }
276 
277 MigrationIncomingState *migration_incoming_get_current(void)
278 {
279     assert(current_incoming);
280     return current_incoming;
281 }
282 
283 void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
284 {
285     if (mis->socket_address_list) {
286         qapi_free_SocketAddressList(mis->socket_address_list);
287         mis->socket_address_list = NULL;
288     }
289 
290     if (mis->transport_cleanup) {
291         mis->transport_cleanup(mis->transport_data);
292         mis->transport_data = mis->transport_cleanup = NULL;
293     }
294 }
295 
296 void migration_incoming_state_destroy(void)
297 {
298     struct MigrationIncomingState *mis = migration_incoming_get_current();
299 
300     if (mis->to_src_file) {
301         /* Tell source that we are done */
302         migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
303         qemu_fclose(mis->to_src_file);
304         mis->to_src_file = NULL;
305     }
306 
307     if (mis->from_src_file) {
308         migration_ioc_unregister_yank_from_file(mis->from_src_file);
309         qemu_fclose(mis->from_src_file);
310         mis->from_src_file = NULL;
311     }
312     if (mis->postcopy_remote_fds) {
313         g_array_free(mis->postcopy_remote_fds, TRUE);
314         mis->postcopy_remote_fds = NULL;
315     }
316 
317     migration_incoming_transport_cleanup(mis);
318     qemu_event_reset(&mis->main_thread_load_event);
319 
320     if (mis->page_requested) {
321         g_tree_destroy(mis->page_requested);
322         mis->page_requested = NULL;
323     }
324 
325     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
326 }
327 
/* Emit a migration QAPI event for @new_state, but only if events are in use. */
static void migrate_generate_event(int new_state)
{
    if (!migrate_use_events()) {
        return;
    }

    qapi_event_send_migration(new_state);
}
334 
335 static bool migrate_late_block_activate(void)
336 {
337     MigrationState *s;
338 
339     s = migrate_get_current();
340 
341     return s->enabled_capabilities[
342         MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE];
343 }
344 
345 /*
346  * Send a message on the return channel back to the source
347  * of the migration.
348  */
349 static int migrate_send_rp_message(MigrationIncomingState *mis,
350                                    enum mig_rp_message_type message_type,
351                                    uint16_t len, void *data)
352 {
353     int ret = 0;
354 
355     trace_migrate_send_rp_message((int)message_type, len);
356     QEMU_LOCK_GUARD(&mis->rp_mutex);
357 
358     /*
359      * It's possible that the file handle got lost due to network
360      * failures.
361      */
362     if (!mis->to_src_file) {
363         ret = -EIO;
364         return ret;
365     }
366 
367     qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
368     qemu_put_be16(mis->to_src_file, len);
369     qemu_put_buffer(mis->to_src_file, data, len);
370     qemu_fflush(mis->to_src_file);
371 
372     /* It's possible that qemu file got error during sending */
373     ret = qemu_file_get_error(mis->to_src_file);
374 
375     return ret;
376 }
377 
378 /* Request one page from the source VM at the given start address.
379  *   rb: the RAMBlock to request the page in
380  *   Start: Address offset within the RB
381  *   Len: Length in bytes required - must be a multiple of pagesize
382  */
383 int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
384                                       RAMBlock *rb, ram_addr_t start)
385 {
386     uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
387     size_t msglen = 12; /* start + len */
388     size_t len = qemu_ram_pagesize(rb);
389     enum mig_rp_message_type msg_type;
390     const char *rbname;
391     int rbname_len;
392 
393     *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
394     *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
395 
396     /*
397      * We maintain the last ramblock that we requested for page.  Note that we
398      * don't need locking because this function will only be called within the
399      * postcopy ram fault thread.
400      */
401     if (rb != mis->last_rb) {
402         mis->last_rb = rb;
403 
404         rbname = qemu_ram_get_idstr(rb);
405         rbname_len = strlen(rbname);
406 
407         assert(rbname_len < 256);
408 
409         bufc[msglen++] = rbname_len;
410         memcpy(bufc + msglen, rbname, rbname_len);
411         msglen += rbname_len;
412         msg_type = MIG_RP_MSG_REQ_PAGES_ID;
413     } else {
414         msg_type = MIG_RP_MSG_REQ_PAGES;
415     }
416 
417     return migrate_send_rp_message(mis, msg_type, msglen, bufc);
418 }
419 
420 int migrate_send_rp_req_pages(MigrationIncomingState *mis,
421                               RAMBlock *rb, ram_addr_t start, uint64_t haddr)
422 {
423     void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
424     bool received = false;
425 
426     WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
427         received = ramblock_recv_bitmap_test_byte_offset(rb, start);
428         if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
429             /*
430              * The page has not been received, and it's not yet in the page
431              * request list.  Queue it.  Set the value of element to 1, so that
432              * things like g_tree_lookup() will return TRUE (1) when found.
433              */
434             g_tree_insert(mis->page_requested, aligned, (gpointer)1);
435             mis->page_requested_count++;
436             trace_postcopy_page_req_add(aligned, mis->page_requested_count);
437         }
438     }
439 
440     /*
441      * If the page is there, skip sending the message.  We don't even need the
442      * lock because as long as the page arrived, it'll be there forever.
443      */
444     if (received) {
445         return 0;
446     }
447 
448     return migrate_send_rp_message_req_pages(mis, rb, start);
449 }
450 
/* Set by migration_incoming_enable_colo(), cleared by ..._disable_colo() */
static bool migration_colo_enabled;

/* Report whether COLO is currently enabled on the incoming side. */
bool migration_incoming_colo_enabled(void)
{
    return migration_colo_enabled;
}
456 
457 void migration_incoming_disable_colo(void)
458 {
459     ram_block_discard_disable(false);
460     migration_colo_enabled = false;
461 }
462 
463 int migration_incoming_enable_colo(void)
464 {
465     if (ram_block_discard_disable(true)) {
466         error_report("COLO: cannot disable RAM discard");
467         return -EBUSY;
468     }
469     migration_colo_enabled = true;
470     return 0;
471 }
472 
473 void migrate_add_address(SocketAddress *address)
474 {
475     MigrationIncomingState *mis = migration_incoming_get_current();
476 
477     QAPI_LIST_PREPEND(mis->socket_address_list,
478                       QAPI_CLONE(SocketAddress, address));
479 }
480 
481 static void qemu_start_incoming_migration(const char *uri, Error **errp)
482 {
483     const char *p = NULL;
484 
485     migrate_protocol_allow_multi_channels(false); /* reset it anyway */
486     qapi_event_send_migration(MIGRATION_STATUS_SETUP);
487     if (strstart(uri, "tcp:", &p) ||
488         strstart(uri, "unix:", NULL) ||
489         strstart(uri, "vsock:", NULL)) {
490         migrate_protocol_allow_multi_channels(true);
491         socket_start_incoming_migration(p ? p : uri, errp);
492 #ifdef CONFIG_RDMA
493     } else if (strstart(uri, "rdma:", &p)) {
494         rdma_start_incoming_migration(p, errp);
495 #endif
496     } else if (strstart(uri, "exec:", &p)) {
497         exec_start_incoming_migration(p, errp);
498     } else if (strstart(uri, "fd:", &p)) {
499         fd_start_incoming_migration(p, errp);
500     } else {
501         error_setg(errp, "unknown migration protocol: %s", uri);
502     }
503 }
504 
505 static void process_incoming_migration_bh(void *opaque)
506 {
507     Error *local_err = NULL;
508     MigrationIncomingState *mis = opaque;
509 
510     /* If capability late_block_activate is set:
511      * Only fire up the block code now if we're going to restart the
512      * VM, else 'cont' will do it.
513      * This causes file locking to happen; so we don't want it to happen
514      * unless we really are starting the VM.
515      */
516     if (!migrate_late_block_activate() ||
517          (autostart && (!global_state_received() ||
518             global_state_get_runstate() == RUN_STATE_RUNNING))) {
519         /* Make sure all file formats throw away their mutable metadata.
520          * If we get an error here, just don't restart the VM yet. */
521         bdrv_activate_all(&local_err);
522         if (local_err) {
523             error_report_err(local_err);
524             local_err = NULL;
525             autostart = false;
526         }
527     }
528 
529     /*
530      * This must happen after all error conditions are dealt with and
531      * we're sure the VM is going to be running on this host.
532      */
533     qemu_announce_self(&mis->announce_timer, migrate_announce_params());
534 
535     if (multifd_load_cleanup(&local_err) != 0) {
536         error_report_err(local_err);
537         autostart = false;
538     }
539     /* If global state section was not received or we are in running
540        state, we need to obey autostart. Any other state is set with
541        runstate_set. */
542 
543     dirty_bitmap_mig_before_vm_start();
544 
545     if (!global_state_received() ||
546         global_state_get_runstate() == RUN_STATE_RUNNING) {
547         if (autostart) {
548             vm_start();
549         } else {
550             runstate_set(RUN_STATE_PAUSED);
551         }
552     } else if (migration_incoming_colo_enabled()) {
553         migration_incoming_disable_colo();
554         vm_start();
555     } else {
556         runstate_set(global_state_get_runstate());
557     }
558     /*
559      * This must happen after any state changes since as soon as an external
560      * observer sees this event they might start to prod at the VM assuming
561      * it's ready to use.
562      */
563     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
564                       MIGRATION_STATUS_COMPLETED);
565     qemu_bh_delete(mis->bh);
566     migration_incoming_state_destroy();
567 }
568 
569 static void process_incoming_migration_co(void *opaque)
570 {
571     MigrationIncomingState *mis = migration_incoming_get_current();
572     PostcopyState ps;
573     int ret;
574     Error *local_err = NULL;
575 
576     assert(mis->from_src_file);
577     mis->migration_incoming_co = qemu_coroutine_self();
578     mis->largest_page_size = qemu_ram_pagesize_largest();
579     postcopy_state_set(POSTCOPY_INCOMING_NONE);
580     migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
581                       MIGRATION_STATUS_ACTIVE);
582     ret = qemu_loadvm_state(mis->from_src_file);
583 
584     ps = postcopy_state_get();
585     trace_process_incoming_migration_co_end(ret, ps);
586     if (ps != POSTCOPY_INCOMING_NONE) {
587         if (ps == POSTCOPY_INCOMING_ADVISE) {
588             /*
589              * Where a migration had postcopy enabled (and thus went to advise)
590              * but managed to complete within the precopy period, we can use
591              * the normal exit.
592              */
593             postcopy_ram_incoming_cleanup(mis);
594         } else if (ret >= 0) {
595             /*
596              * Postcopy was started, cleanup should happen at the end of the
597              * postcopy thread.
598              */
599             trace_process_incoming_migration_co_postcopy_end_main();
600             return;
601         }
602         /* Else if something went wrong then just fall out of the normal exit */
603     }
604 
605     /* we get COLO info, and know if we are in COLO mode */
606     if (!ret && migration_incoming_colo_enabled()) {
607         /* Make sure all file formats throw away their mutable metadata */
608         bdrv_activate_all(&local_err);
609         if (local_err) {
610             error_report_err(local_err);
611             goto fail;
612         }
613 
614         qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
615              colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
616         mis->have_colo_incoming_thread = true;
617         qemu_coroutine_yield();
618 
619         qemu_mutex_unlock_iothread();
620         /* Wait checkpoint incoming thread exit before free resource */
621         qemu_thread_join(&mis->colo_incoming_thread);
622         qemu_mutex_lock_iothread();
623         /* We hold the global iothread lock, so it is safe here */
624         colo_release_ram_cache();
625     }
626 
627     if (ret < 0) {
628         error_report("load of migration failed: %s", strerror(-ret));
629         goto fail;
630     }
631     mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
632     qemu_bh_schedule(mis->bh);
633     mis->migration_incoming_co = NULL;
634     return;
635 fail:
636     local_err = NULL;
637     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
638                       MIGRATION_STATUS_FAILED);
639     qemu_fclose(mis->from_src_file);
640     if (multifd_load_cleanup(&local_err) != 0) {
641         error_report_err(local_err);
642     }
643     exit(EXIT_FAILURE);
644 }
645 
646 /**
647  * migration_incoming_setup: Setup incoming migration
648  * @f: file for main migration channel
649  * @errp: where to put errors
650  *
651  * Returns: %true on success, %false on error.
652  */
653 static bool migration_incoming_setup(QEMUFile *f, Error **errp)
654 {
655     MigrationIncomingState *mis = migration_incoming_get_current();
656 
657     if (multifd_load_setup(errp) != 0) {
658         return false;
659     }
660 
661     if (!mis->from_src_file) {
662         mis->from_src_file = f;
663     }
664     qemu_file_set_blocking(f, false);
665     return true;
666 }
667 
668 void migration_incoming_process(void)
669 {
670     Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
671     qemu_coroutine_enter(co);
672 }
673 
674 /* Returns true if recovered from a paused migration, otherwise false */
675 static bool postcopy_try_recover(void)
676 {
677     MigrationIncomingState *mis = migration_incoming_get_current();
678 
679     if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
680         /* Resumed from a paused postcopy migration */
681 
682         /* This should be set already in migration_incoming_setup() */
683         assert(mis->from_src_file);
684         /* Postcopy has standalone thread to do vm load */
685         qemu_file_set_blocking(mis->from_src_file, true);
686 
687         /* Re-configure the return path */
688         mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
689 
690         migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
691                           MIGRATION_STATUS_POSTCOPY_RECOVER);
692 
693         /*
694          * Here, we only wake up the main loading thread (while the
695          * fault thread will still be waiting), so that we can receive
696          * commands from source now, and answer it if needed. The
697          * fault thread will be woken up afterwards until we are sure
698          * that source is ready to reply to page requests.
699          */
700         qemu_sem_post(&mis->postcopy_pause_sem_dst);
701         return true;
702     }
703 
704     return false;
705 }
706 
707 void migration_fd_process_incoming(QEMUFile *f, Error **errp)
708 {
709     if (!migration_incoming_setup(f, errp)) {
710         return;
711     }
712     if (postcopy_try_recover()) {
713         return;
714     }
715     migration_incoming_process();
716 }
717 
718 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
719 {
720     MigrationIncomingState *mis = migration_incoming_get_current();
721     Error *local_err = NULL;
722     bool start_migration;
723 
724     if (!mis->from_src_file) {
725         /* The first connection (multifd may have multiple) */
726         QEMUFile *f = qemu_fopen_channel_input(ioc);
727 
728         if (!migration_incoming_setup(f, errp)) {
729             return;
730         }
731 
732         /*
733          * Common migration only needs one channel, so we can start
734          * right now.  Multifd needs more than one channel, we wait.
735          */
736         start_migration = !migrate_use_multifd();
737     } else {
738         /* Multiple connections */
739         assert(migrate_use_multifd());
740         start_migration = multifd_recv_new_channel(ioc, &local_err);
741         if (local_err) {
742             error_propagate(errp, local_err);
743             return;
744         }
745     }
746 
747     if (start_migration) {
748         /* If it's a recovery, we're done */
749         if (postcopy_try_recover()) {
750             return;
751         }
752         migration_incoming_process();
753     }
754 }
755 
756 /**
757  * @migration_has_all_channels: We have received all channels that we need
758  *
759  * Returns true when we have got connections to all the channels that
760  * we need for migration.
761  */
762 bool migration_has_all_channels(void)
763 {
764     MigrationIncomingState *mis = migration_incoming_get_current();
765     bool all_channels;
766 
767     all_channels = multifd_recv_all_channels_created();
768 
769     return all_channels && mis->from_src_file != NULL;
770 }
771 
772 /*
773  * Send a 'SHUT' message on the return channel with the given value
774  * to indicate that we've finished with the RP.  Non-0 value indicates
775  * error.
776  */
777 void migrate_send_rp_shut(MigrationIncomingState *mis,
778                           uint32_t value)
779 {
780     uint32_t buf;
781 
782     buf = cpu_to_be32(value);
783     migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
784 }
785 
786 /*
787  * Send a 'PONG' message on the return channel with the given value
788  * (normally in response to a 'PING')
789  */
790 void migrate_send_rp_pong(MigrationIncomingState *mis,
791                           uint32_t value)
792 {
793     uint32_t buf;
794 
795     buf = cpu_to_be32(value);
796     migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
797 }
798 
799 void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
800                                  char *block_name)
801 {
802     char buf[512];
803     int len;
804     int64_t res;
805 
806     /*
807      * First, we send the header part. It contains only the len of
808      * idstr, and the idstr itself.
809      */
810     len = strlen(block_name);
811     buf[0] = len;
812     memcpy(buf + 1, block_name, len);
813 
814     if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
815         error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
816                      __func__);
817         return;
818     }
819 
820     migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);
821 
822     /*
823      * Next, we dump the received bitmap to the stream.
824      *
825      * TODO: currently we are safe since we are the only one that is
826      * using the to_src_file handle (fault thread is still paused),
827      * and it's ok even not taking the mutex. However the best way is
828      * to take the lock before sending the message header, and release
829      * the lock after sending the bitmap.
830      */
831     qemu_mutex_lock(&mis->rp_mutex);
832     res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
833     qemu_mutex_unlock(&mis->rp_mutex);
834 
835     trace_migrate_send_rp_recv_bitmap(block_name, res);
836 }
837 
838 void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
839 {
840     uint32_t buf;
841 
842     buf = cpu_to_be32(value);
843     migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
844 }
845 
846 MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
847 {
848     MigrationCapabilityStatusList *head = NULL, **tail = &head;
849     MigrationCapabilityStatus *caps;
850     MigrationState *s = migrate_get_current();
851     int i;
852 
853     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
854 #ifndef CONFIG_LIVE_BLOCK_MIGRATION
855         if (i == MIGRATION_CAPABILITY_BLOCK) {
856             continue;
857         }
858 #endif
859         caps = g_malloc0(sizeof(*caps));
860         caps->capability = i;
861         caps->state = s->enabled_capabilities[i];
862         QAPI_LIST_APPEND(tail, caps);
863     }
864 
865     return head;
866 }
867 
868 MigrationParameters *qmp_query_migrate_parameters(Error **errp)
869 {
870     MigrationParameters *params;
871     MigrationState *s = migrate_get_current();
872 
873     /* TODO use QAPI_CLONE() instead of duplicating it inline */
874     params = g_malloc0(sizeof(*params));
875     params->has_compress_level = true;
876     params->compress_level = s->parameters.compress_level;
877     params->has_compress_threads = true;
878     params->compress_threads = s->parameters.compress_threads;
879     params->has_compress_wait_thread = true;
880     params->compress_wait_thread = s->parameters.compress_wait_thread;
881     params->has_decompress_threads = true;
882     params->decompress_threads = s->parameters.decompress_threads;
883     params->has_throttle_trigger_threshold = true;
884     params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold;
885     params->has_cpu_throttle_initial = true;
886     params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
887     params->has_cpu_throttle_increment = true;
888     params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
889     params->has_cpu_throttle_tailslow = true;
890     params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
891     params->has_tls_creds = true;
892     params->tls_creds = g_strdup(s->parameters.tls_creds);
893     params->has_tls_hostname = true;
894     params->tls_hostname = g_strdup(s->parameters.tls_hostname);
895     params->has_tls_authz = true;
896     params->tls_authz = g_strdup(s->parameters.tls_authz ?
897                                  s->parameters.tls_authz : "");
898     params->has_max_bandwidth = true;
899     params->max_bandwidth = s->parameters.max_bandwidth;
900     params->has_downtime_limit = true;
901     params->downtime_limit = s->parameters.downtime_limit;
902     params->has_x_checkpoint_delay = true;
903     params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
904     params->has_block_incremental = true;
905     params->block_incremental = s->parameters.block_incremental;
906     params->has_multifd_channels = true;
907     params->multifd_channels = s->parameters.multifd_channels;
908     params->has_multifd_compression = true;
909     params->multifd_compression = s->parameters.multifd_compression;
910     params->has_multifd_zlib_level = true;
911     params->multifd_zlib_level = s->parameters.multifd_zlib_level;
912     params->has_multifd_zstd_level = true;
913     params->multifd_zstd_level = s->parameters.multifd_zstd_level;
914     params->has_xbzrle_cache_size = true;
915     params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
916     params->has_max_postcopy_bandwidth = true;
917     params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth;
918     params->has_max_cpu_throttle = true;
919     params->max_cpu_throttle = s->parameters.max_cpu_throttle;
920     params->has_announce_initial = true;
921     params->announce_initial = s->parameters.announce_initial;
922     params->has_announce_max = true;
923     params->announce_max = s->parameters.announce_max;
924     params->has_announce_rounds = true;
925     params->announce_rounds = s->parameters.announce_rounds;
926     params->has_announce_step = true;
927     params->announce_step = s->parameters.announce_step;
928 
929     if (s->parameters.has_block_bitmap_mapping) {
930         params->has_block_bitmap_mapping = true;
931         params->block_bitmap_mapping =
932             QAPI_CLONE(BitmapMigrationNodeAliasList,
933                        s->parameters.block_bitmap_mapping);
934     }
935 
936     return params;
937 }
938 
939 AnnounceParameters *migrate_announce_params(void)
940 {
941     static AnnounceParameters ap;
942 
943     MigrationState *s = migrate_get_current();
944 
945     ap.initial = s->parameters.announce_initial;
946     ap.max = s->parameters.announce_max;
947     ap.rounds = s->parameters.announce_rounds;
948     ap.step = s->parameters.announce_step;
949 
950     return &ap;
951 }
952 
953 /*
954  * Return true if we're already in the middle of a migration
955  * (i.e. any of the active or setup states)
956  */
957 bool migration_is_setup_or_active(int state)
958 {
959     switch (state) {
960     case MIGRATION_STATUS_ACTIVE:
961     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
962     case MIGRATION_STATUS_POSTCOPY_PAUSED:
963     case MIGRATION_STATUS_POSTCOPY_RECOVER:
964     case MIGRATION_STATUS_SETUP:
965     case MIGRATION_STATUS_PRE_SWITCHOVER:
966     case MIGRATION_STATUS_DEVICE:
967     case MIGRATION_STATUS_WAIT_UNPLUG:
968     case MIGRATION_STATUS_COLO:
969         return true;
970 
971     default:
972         return false;
973 
974     }
975 }
976 
977 bool migration_is_running(int state)
978 {
979     switch (state) {
980     case MIGRATION_STATUS_ACTIVE:
981     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
982     case MIGRATION_STATUS_POSTCOPY_PAUSED:
983     case MIGRATION_STATUS_POSTCOPY_RECOVER:
984     case MIGRATION_STATUS_SETUP:
985     case MIGRATION_STATUS_PRE_SWITCHOVER:
986     case MIGRATION_STATUS_DEVICE:
987     case MIGRATION_STATUS_WAIT_UNPLUG:
988     case MIGRATION_STATUS_CANCELLING:
989         return true;
990 
991     default:
992         return false;
993 
994     }
995 }
996 
997 static void populate_time_info(MigrationInfo *info, MigrationState *s)
998 {
999     info->has_status = true;
1000     info->has_setup_time = true;
1001     info->setup_time = s->setup_time;
1002     if (s->state == MIGRATION_STATUS_COMPLETED) {
1003         info->has_total_time = true;
1004         info->total_time = s->total_time;
1005         info->has_downtime = true;
1006         info->downtime = s->downtime;
1007     } else {
1008         info->has_total_time = true;
1009         info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
1010                            s->start_time;
1011         info->has_expected_downtime = true;
1012         info->expected_downtime = s->expected_downtime;
1013     }
1014 }
1015 
1016 static void populate_ram_info(MigrationInfo *info, MigrationState *s)
1017 {
1018     size_t page_size = qemu_target_page_size();
1019 
1020     info->has_ram = true;
1021     info->ram = g_malloc0(sizeof(*info->ram));
1022     info->ram->transferred = ram_counters.transferred;
1023     info->ram->total = ram_bytes_total();
1024     info->ram->duplicate = ram_counters.duplicate;
1025     /* legacy value.  It is not used anymore */
1026     info->ram->skipped = 0;
1027     info->ram->normal = ram_counters.normal;
1028     info->ram->normal_bytes = ram_counters.normal * page_size;
1029     info->ram->mbps = s->mbps;
1030     info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
1031     info->ram->postcopy_requests = ram_counters.postcopy_requests;
1032     info->ram->page_size = page_size;
1033     info->ram->multifd_bytes = ram_counters.multifd_bytes;
1034     info->ram->pages_per_second = s->pages_per_second;
1035     info->ram->precopy_bytes = ram_counters.precopy_bytes;
1036     info->ram->downtime_bytes = ram_counters.downtime_bytes;
1037     info->ram->postcopy_bytes = ram_counters.postcopy_bytes;
1038 
1039     if (migrate_use_xbzrle()) {
1040         info->has_xbzrle_cache = true;
1041         info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
1042         info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
1043         info->xbzrle_cache->bytes = xbzrle_counters.bytes;
1044         info->xbzrle_cache->pages = xbzrle_counters.pages;
1045         info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
1046         info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
1047         info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
1048         info->xbzrle_cache->overflow = xbzrle_counters.overflow;
1049     }
1050 
1051     if (migrate_use_compression()) {
1052         info->has_compression = true;
1053         info->compression = g_malloc0(sizeof(*info->compression));
1054         info->compression->pages = compression_counters.pages;
1055         info->compression->busy = compression_counters.busy;
1056         info->compression->busy_rate = compression_counters.busy_rate;
1057         info->compression->compressed_size =
1058                                     compression_counters.compressed_size;
1059         info->compression->compression_rate =
1060                                     compression_counters.compression_rate;
1061     }
1062 
1063     if (cpu_throttle_active()) {
1064         info->has_cpu_throttle_percentage = true;
1065         info->cpu_throttle_percentage = cpu_throttle_get_percentage();
1066     }
1067 
1068     if (s->state != MIGRATION_STATUS_COMPLETED) {
1069         info->ram->remaining = ram_bytes_remaining();
1070         info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate;
1071     }
1072 }
1073 
1074 static void populate_disk_info(MigrationInfo *info)
1075 {
1076     if (blk_mig_active()) {
1077         info->has_disk = true;
1078         info->disk = g_malloc0(sizeof(*info->disk));
1079         info->disk->transferred = blk_mig_bytes_transferred();
1080         info->disk->remaining = blk_mig_bytes_remaining();
1081         info->disk->total = blk_mig_bytes_total();
1082     }
1083 }
1084 
1085 static void fill_source_migration_info(MigrationInfo *info)
1086 {
1087     MigrationState *s = migrate_get_current();
1088     int state = qatomic_read(&s->state);
1089     GSList *cur_blocker = migration_blockers;
1090 
1091     info->blocked_reasons = NULL;
1092 
1093     /*
1094      * There are two types of reasons a migration might be blocked;
1095      * a) devices marked in VMState as non-migratable, and
1096      * b) Explicit migration blockers
1097      * We need to add both of them here.
1098      */
1099     qemu_savevm_non_migratable_list(&info->blocked_reasons);
1100 
1101     while (cur_blocker) {
1102         QAPI_LIST_PREPEND(info->blocked_reasons,
1103                           g_strdup(error_get_pretty(cur_blocker->data)));
1104         cur_blocker = g_slist_next(cur_blocker);
1105     }
1106     info->has_blocked_reasons = info->blocked_reasons != NULL;
1107 
1108     switch (state) {
1109     case MIGRATION_STATUS_NONE:
1110         /* no migration has happened ever */
1111         /* do not overwrite destination migration status */
1112         return;
1113     case MIGRATION_STATUS_SETUP:
1114         info->has_status = true;
1115         info->has_total_time = false;
1116         break;
1117     case MIGRATION_STATUS_ACTIVE:
1118     case MIGRATION_STATUS_CANCELLING:
1119     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1120     case MIGRATION_STATUS_PRE_SWITCHOVER:
1121     case MIGRATION_STATUS_DEVICE:
1122     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1123     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1124         /* TODO add some postcopy stats */
1125         populate_time_info(info, s);
1126         populate_ram_info(info, s);
1127         populate_disk_info(info);
1128         populate_vfio_info(info);
1129         break;
1130     case MIGRATION_STATUS_COLO:
1131         info->has_status = true;
1132         /* TODO: display COLO specific information (checkpoint info etc.) */
1133         break;
1134     case MIGRATION_STATUS_COMPLETED:
1135         populate_time_info(info, s);
1136         populate_ram_info(info, s);
1137         populate_vfio_info(info);
1138         break;
1139     case MIGRATION_STATUS_FAILED:
1140         info->has_status = true;
1141         if (s->error) {
1142             info->has_error_desc = true;
1143             info->error_desc = g_strdup(error_get_pretty(s->error));
1144         }
1145         break;
1146     case MIGRATION_STATUS_CANCELLED:
1147         info->has_status = true;
1148         break;
1149     case MIGRATION_STATUS_WAIT_UNPLUG:
1150         info->has_status = true;
1151         break;
1152     }
1153     info->status = state;
1154 }
1155 
/*
 * Level of write-tracking support, in increasing order.  The numeric
 * order matters: users compare levels with '<'.
 */
typedef enum WriteTrackingSupport {
    WT_SUPPORT_UNKNOWN    = 0,
    WT_SUPPORT_ABSENT     = 1,
    WT_SUPPORT_AVAILABLE  = 2,
    WT_SUPPORT_COMPATIBLE = 3
} WriteTrackingSupport;
1162 
1163 static
1164 WriteTrackingSupport migrate_query_write_tracking(void)
1165 {
1166     /* Check if kernel supports required UFFD features */
1167     if (!ram_write_tracking_available()) {
1168         return WT_SUPPORT_ABSENT;
1169     }
1170     /*
1171      * Check if current memory configuration is
1172      * compatible with required UFFD features.
1173      */
1174     if (!ram_write_tracking_compatible()) {
1175         return WT_SUPPORT_AVAILABLE;
1176     }
1177 
1178     return WT_SUPPORT_COMPATIBLE;
1179 }
1180 
1181 /**
1182  * @migration_caps_check - check capability validity
1183  *
1184  * @cap_list: old capability list, array of bool
1185  * @params: new capabilities to be applied soon
1186  * @errp: set *errp if the check failed, with reason
1187  *
1188  * Returns true if check passed, otherwise false.
1189  */
1190 static bool migrate_caps_check(bool *cap_list,
1191                                MigrationCapabilityStatusList *params,
1192                                Error **errp)
1193 {
1194     MigrationCapabilityStatusList *cap;
1195     bool old_postcopy_cap;
1196     MigrationIncomingState *mis = migration_incoming_get_current();
1197 
1198     old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM];
1199 
1200     for (cap = params; cap; cap = cap->next) {
1201         cap_list[cap->value->capability] = cap->value->state;
1202     }
1203 
1204 #ifndef CONFIG_LIVE_BLOCK_MIGRATION
1205     if (cap_list[MIGRATION_CAPABILITY_BLOCK]) {
1206         error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) "
1207                    "block migration");
1208         error_append_hint(errp, "Use drive_mirror+NBD instead.\n");
1209         return false;
1210     }
1211 #endif
1212 
1213 #ifndef CONFIG_REPLICATION
1214     if (cap_list[MIGRATION_CAPABILITY_X_COLO]) {
1215         error_setg(errp, "QEMU compiled without replication module"
1216                    " can't enable COLO");
1217         error_append_hint(errp, "Please enable replication before COLO.\n");
1218         return false;
1219     }
1220 #endif
1221 
1222     if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
1223         /* This check is reasonably expensive, so only when it's being
1224          * set the first time, also it's only the destination that needs
1225          * special support.
1226          */
1227         if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
1228             !postcopy_ram_supported_by_host(mis)) {
1229             /* postcopy_ram_supported_by_host will have emitted a more
1230              * detailed message
1231              */
1232             error_setg(errp, "Postcopy is not supported");
1233             return false;
1234         }
1235 
1236         if (cap_list[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) {
1237             error_setg(errp, "Postcopy is not compatible with ignore-shared");
1238             return false;
1239         }
1240     }
1241 
1242     if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
1243         WriteTrackingSupport wt_support;
1244         int idx;
1245         /*
1246          * Check if 'background-snapshot' capability is supported by
1247          * host kernel and compatible with guest memory configuration.
1248          */
1249         wt_support = migrate_query_write_tracking();
1250         if (wt_support < WT_SUPPORT_AVAILABLE) {
1251             error_setg(errp, "Background-snapshot is not supported by host kernel");
1252             return false;
1253         }
1254         if (wt_support < WT_SUPPORT_COMPATIBLE) {
1255             error_setg(errp, "Background-snapshot is not compatible "
1256                     "with guest memory configuration");
1257             return false;
1258         }
1259 
1260         /*
1261          * Check if there are any migration capabilities
1262          * incompatible with 'background-snapshot'.
1263          */
1264         for (idx = 0; idx < check_caps_background_snapshot.size; idx++) {
1265             int incomp_cap = check_caps_background_snapshot.caps[idx];
1266             if (cap_list[incomp_cap]) {
1267                 error_setg(errp,
1268                         "Background-snapshot is not compatible with %s",
1269                         MigrationCapability_str(incomp_cap));
1270                 return false;
1271             }
1272         }
1273     }
1274 
1275 #ifdef CONFIG_LINUX
1276     if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND] &&
1277         (!cap_list[MIGRATION_CAPABILITY_MULTIFD] ||
1278          migrate_use_compression() ||
1279          migrate_use_tls())) {
1280         error_setg(errp,
1281                    "Zero copy only available for non-compressed non-TLS multifd migration");
1282         return false;
1283     }
1284 #else
1285     if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND]) {
1286         error_setg(errp,
1287                    "Zero copy currently only available on Linux");
1288         return false;
1289     }
1290 #endif
1291 
1292 
1293     /* incoming side only */
1294     if (runstate_check(RUN_STATE_INMIGRATE) &&
1295         !migrate_multi_channels_is_allowed() &&
1296         cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
1297         error_setg(errp, "multifd is not supported by current protocol");
1298         return false;
1299     }
1300 
1301     return true;
1302 }
1303 
1304 static void fill_destination_migration_info(MigrationInfo *info)
1305 {
1306     MigrationIncomingState *mis = migration_incoming_get_current();
1307 
1308     if (mis->socket_address_list) {
1309         info->has_socket_address = true;
1310         info->socket_address =
1311             QAPI_CLONE(SocketAddressList, mis->socket_address_list);
1312     }
1313 
1314     switch (mis->state) {
1315     case MIGRATION_STATUS_NONE:
1316         return;
1317     case MIGRATION_STATUS_SETUP:
1318     case MIGRATION_STATUS_CANCELLING:
1319     case MIGRATION_STATUS_CANCELLED:
1320     case MIGRATION_STATUS_ACTIVE:
1321     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1322     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1323     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1324     case MIGRATION_STATUS_FAILED:
1325     case MIGRATION_STATUS_COLO:
1326         info->has_status = true;
1327         break;
1328     case MIGRATION_STATUS_COMPLETED:
1329         info->has_status = true;
1330         fill_destination_postcopy_migration_info(info);
1331         break;
1332     }
1333     info->status = mis->state;
1334 }
1335 
1336 MigrationInfo *qmp_query_migrate(Error **errp)
1337 {
1338     MigrationInfo *info = g_malloc0(sizeof(*info));
1339 
1340     fill_destination_migration_info(info);
1341     fill_source_migration_info(info);
1342 
1343     return info;
1344 }
1345 
1346 void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
1347                                   Error **errp)
1348 {
1349     MigrationState *s = migrate_get_current();
1350     MigrationCapabilityStatusList *cap;
1351     bool cap_list[MIGRATION_CAPABILITY__MAX];
1352 
1353     if (migration_is_running(s->state)) {
1354         error_setg(errp, QERR_MIGRATION_ACTIVE);
1355         return;
1356     }
1357 
1358     memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
1359     if (!migrate_caps_check(cap_list, params, errp)) {
1360         return;
1361     }
1362 
1363     for (cap = params; cap; cap = cap->next) {
1364         s->enabled_capabilities[cap->value->capability] = cap->value->state;
1365     }
1366 }
1367 
1368 /*
1369  * Check whether the parameters are valid. Error will be put into errp
1370  * (if provided). Return true if valid, otherwise false.
1371  */
1372 static bool migrate_params_check(MigrationParameters *params, Error **errp)
1373 {
1374     if (params->has_compress_level &&
1375         (params->compress_level > 9)) {
1376         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
1377                    "a value between 0 and 9");
1378         return false;
1379     }
1380 
1381     if (params->has_compress_threads && (params->compress_threads < 1)) {
1382         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1383                    "compress_threads",
1384                    "a value between 1 and 255");
1385         return false;
1386     }
1387 
1388     if (params->has_decompress_threads && (params->decompress_threads < 1)) {
1389         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1390                    "decompress_threads",
1391                    "a value between 1 and 255");
1392         return false;
1393     }
1394 
1395     if (params->has_throttle_trigger_threshold &&
1396         (params->throttle_trigger_threshold < 1 ||
1397          params->throttle_trigger_threshold > 100)) {
1398         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1399                    "throttle_trigger_threshold",
1400                    "an integer in the range of 1 to 100");
1401         return false;
1402     }
1403 
1404     if (params->has_cpu_throttle_initial &&
1405         (params->cpu_throttle_initial < 1 ||
1406          params->cpu_throttle_initial > 99)) {
1407         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1408                    "cpu_throttle_initial",
1409                    "an integer in the range of 1 to 99");
1410         return false;
1411     }
1412 
1413     if (params->has_cpu_throttle_increment &&
1414         (params->cpu_throttle_increment < 1 ||
1415          params->cpu_throttle_increment > 99)) {
1416         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1417                    "cpu_throttle_increment",
1418                    "an integer in the range of 1 to 99");
1419         return false;
1420     }
1421 
1422     if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
1423         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1424                    "max_bandwidth",
1425                    "an integer in the range of 0 to "stringify(SIZE_MAX)
1426                    " bytes/second");
1427         return false;
1428     }
1429 
1430     if (params->has_downtime_limit &&
1431         (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
1432         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1433                    "downtime_limit",
1434                    "an integer in the range of 0 to "
1435                     stringify(MAX_MIGRATE_DOWNTIME)" ms");
1436         return false;
1437     }
1438 
1439     /* x_checkpoint_delay is now always positive */
1440 
1441     if (params->has_multifd_channels && (params->multifd_channels < 1)) {
1442         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1443                    "multifd_channels",
1444                    "a value between 1 and 255");
1445         return false;
1446     }
1447 
1448     if (params->has_multifd_zlib_level &&
1449         (params->multifd_zlib_level > 9)) {
1450         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level",
1451                    "a value between 0 and 9");
1452         return false;
1453     }
1454 
1455     if (params->has_multifd_zstd_level &&
1456         (params->multifd_zstd_level > 20)) {
1457         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
1458                    "a value between 0 and 20");
1459         return false;
1460     }
1461 
1462     if (params->has_xbzrle_cache_size &&
1463         (params->xbzrle_cache_size < qemu_target_page_size() ||
1464          !is_power_of_2(params->xbzrle_cache_size))) {
1465         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1466                    "xbzrle_cache_size",
1467                    "a power of two no less than the target page size");
1468         return false;
1469     }
1470 
1471     if (params->has_max_cpu_throttle &&
1472         (params->max_cpu_throttle < params->cpu_throttle_initial ||
1473          params->max_cpu_throttle > 99)) {
1474         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1475                    "max_cpu_throttle",
1476                    "an integer in the range of cpu_throttle_initial to 99");
1477         return false;
1478     }
1479 
1480     if (params->has_announce_initial &&
1481         params->announce_initial > 100000) {
1482         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1483                    "announce_initial",
1484                    "a value between 0 and 100000");
1485         return false;
1486     }
1487     if (params->has_announce_max &&
1488         params->announce_max > 100000) {
1489         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1490                    "announce_max",
1491                    "a value between 0 and 100000");
1492        return false;
1493     }
1494     if (params->has_announce_rounds &&
1495         params->announce_rounds > 1000) {
1496         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1497                    "announce_rounds",
1498                    "a value between 0 and 1000");
1499        return false;
1500     }
1501     if (params->has_announce_step &&
1502         (params->announce_step < 1 ||
1503         params->announce_step > 10000)) {
1504         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1505                    "announce_step",
1506                    "a value between 0 and 10000");
1507        return false;
1508     }
1509 
1510     if (params->has_block_bitmap_mapping &&
1511         !check_dirty_bitmap_mig_alias_map(params->block_bitmap_mapping, errp)) {
1512         error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: ");
1513         return false;
1514     }
1515     return true;
1516 }
1517 
1518 static void migrate_params_test_apply(MigrateSetParameters *params,
1519                                       MigrationParameters *dest)
1520 {
1521     *dest = migrate_get_current()->parameters;
1522 
1523     /* TODO use QAPI_CLONE() instead of duplicating it inline */
1524 
1525     if (params->has_compress_level) {
1526         dest->compress_level = params->compress_level;
1527     }
1528 
1529     if (params->has_compress_threads) {
1530         dest->compress_threads = params->compress_threads;
1531     }
1532 
1533     if (params->has_compress_wait_thread) {
1534         dest->compress_wait_thread = params->compress_wait_thread;
1535     }
1536 
1537     if (params->has_decompress_threads) {
1538         dest->decompress_threads = params->decompress_threads;
1539     }
1540 
1541     if (params->has_throttle_trigger_threshold) {
1542         dest->throttle_trigger_threshold = params->throttle_trigger_threshold;
1543     }
1544 
1545     if (params->has_cpu_throttle_initial) {
1546         dest->cpu_throttle_initial = params->cpu_throttle_initial;
1547     }
1548 
1549     if (params->has_cpu_throttle_increment) {
1550         dest->cpu_throttle_increment = params->cpu_throttle_increment;
1551     }
1552 
1553     if (params->has_cpu_throttle_tailslow) {
1554         dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1555     }
1556 
1557     if (params->has_tls_creds) {
1558         assert(params->tls_creds->type == QTYPE_QSTRING);
1559         dest->tls_creds = params->tls_creds->u.s;
1560     }
1561 
1562     if (params->has_tls_hostname) {
1563         assert(params->tls_hostname->type == QTYPE_QSTRING);
1564         dest->tls_hostname = params->tls_hostname->u.s;
1565     }
1566 
1567     if (params->has_max_bandwidth) {
1568         dest->max_bandwidth = params->max_bandwidth;
1569     }
1570 
1571     if (params->has_downtime_limit) {
1572         dest->downtime_limit = params->downtime_limit;
1573     }
1574 
1575     if (params->has_x_checkpoint_delay) {
1576         dest->x_checkpoint_delay = params->x_checkpoint_delay;
1577     }
1578 
1579     if (params->has_block_incremental) {
1580         dest->block_incremental = params->block_incremental;
1581     }
1582     if (params->has_multifd_channels) {
1583         dest->multifd_channels = params->multifd_channels;
1584     }
1585     if (params->has_multifd_compression) {
1586         dest->multifd_compression = params->multifd_compression;
1587     }
1588     if (params->has_xbzrle_cache_size) {
1589         dest->xbzrle_cache_size = params->xbzrle_cache_size;
1590     }
1591     if (params->has_max_postcopy_bandwidth) {
1592         dest->max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1593     }
1594     if (params->has_max_cpu_throttle) {
1595         dest->max_cpu_throttle = params->max_cpu_throttle;
1596     }
1597     if (params->has_announce_initial) {
1598         dest->announce_initial = params->announce_initial;
1599     }
1600     if (params->has_announce_max) {
1601         dest->announce_max = params->announce_max;
1602     }
1603     if (params->has_announce_rounds) {
1604         dest->announce_rounds = params->announce_rounds;
1605     }
1606     if (params->has_announce_step) {
1607         dest->announce_step = params->announce_step;
1608     }
1609 
1610     if (params->has_block_bitmap_mapping) {
1611         dest->has_block_bitmap_mapping = true;
1612         dest->block_bitmap_mapping = params->block_bitmap_mapping;
1613     }
1614 }
1615 
1616 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
1617 {
1618     MigrationState *s = migrate_get_current();
1619 
1620     /* TODO use QAPI_CLONE() instead of duplicating it inline */
1621 
1622     if (params->has_compress_level) {
1623         s->parameters.compress_level = params->compress_level;
1624     }
1625 
1626     if (params->has_compress_threads) {
1627         s->parameters.compress_threads = params->compress_threads;
1628     }
1629 
1630     if (params->has_compress_wait_thread) {
1631         s->parameters.compress_wait_thread = params->compress_wait_thread;
1632     }
1633 
1634     if (params->has_decompress_threads) {
1635         s->parameters.decompress_threads = params->decompress_threads;
1636     }
1637 
1638     if (params->has_throttle_trigger_threshold) {
1639         s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold;
1640     }
1641 
1642     if (params->has_cpu_throttle_initial) {
1643         s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
1644     }
1645 
1646     if (params->has_cpu_throttle_increment) {
1647         s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
1648     }
1649 
1650     if (params->has_cpu_throttle_tailslow) {
1651         s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1652     }
1653 
1654     if (params->has_tls_creds) {
1655         g_free(s->parameters.tls_creds);
1656         assert(params->tls_creds->type == QTYPE_QSTRING);
1657         s->parameters.tls_creds = g_strdup(params->tls_creds->u.s);
1658     }
1659 
1660     if (params->has_tls_hostname) {
1661         g_free(s->parameters.tls_hostname);
1662         assert(params->tls_hostname->type == QTYPE_QSTRING);
1663         s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s);
1664     }
1665 
1666     if (params->has_tls_authz) {
1667         g_free(s->parameters.tls_authz);
1668         assert(params->tls_authz->type == QTYPE_QSTRING);
1669         s->parameters.tls_authz = g_strdup(params->tls_authz->u.s);
1670     }
1671 
1672     if (params->has_max_bandwidth) {
1673         s->parameters.max_bandwidth = params->max_bandwidth;
1674         if (s->to_dst_file && !migration_in_postcopy()) {
1675             qemu_file_set_rate_limit(s->to_dst_file,
1676                                 s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
1677         }
1678     }
1679 
1680     if (params->has_downtime_limit) {
1681         s->parameters.downtime_limit = params->downtime_limit;
1682     }
1683 
1684     if (params->has_x_checkpoint_delay) {
1685         s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
1686         if (migration_in_colo_state()) {
1687             colo_checkpoint_notify(s);
1688         }
1689     }
1690 
1691     if (params->has_block_incremental) {
1692         s->parameters.block_incremental = params->block_incremental;
1693     }
1694     if (params->has_multifd_channels) {
1695         s->parameters.multifd_channels = params->multifd_channels;
1696     }
1697     if (params->has_multifd_compression) {
1698         s->parameters.multifd_compression = params->multifd_compression;
1699     }
1700     if (params->has_xbzrle_cache_size) {
1701         s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
1702         xbzrle_cache_resize(params->xbzrle_cache_size, errp);
1703     }
1704     if (params->has_max_postcopy_bandwidth) {
1705         s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1706         if (s->to_dst_file && migration_in_postcopy()) {
1707             qemu_file_set_rate_limit(s->to_dst_file,
1708                     s->parameters.max_postcopy_bandwidth / XFER_LIMIT_RATIO);
1709         }
1710     }
1711     if (params->has_max_cpu_throttle) {
1712         s->parameters.max_cpu_throttle = params->max_cpu_throttle;
1713     }
1714     if (params->has_announce_initial) {
1715         s->parameters.announce_initial = params->announce_initial;
1716     }
1717     if (params->has_announce_max) {
1718         s->parameters.announce_max = params->announce_max;
1719     }
1720     if (params->has_announce_rounds) {
1721         s->parameters.announce_rounds = params->announce_rounds;
1722     }
1723     if (params->has_announce_step) {
1724         s->parameters.announce_step = params->announce_step;
1725     }
1726 
1727     if (params->has_block_bitmap_mapping) {
1728         qapi_free_BitmapMigrationNodeAliasList(
1729             s->parameters.block_bitmap_mapping);
1730 
1731         s->parameters.has_block_bitmap_mapping = true;
1732         s->parameters.block_bitmap_mapping =
1733             QAPI_CLONE(BitmapMigrationNodeAliasList,
1734                        params->block_bitmap_mapping);
1735     }
1736 }
1737 
1738 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
1739 {
1740     MigrationParameters tmp;
1741 
1742     /* TODO Rewrite "" to null instead */
1743     if (params->has_tls_creds
1744         && params->tls_creds->type == QTYPE_QNULL) {
1745         qobject_unref(params->tls_creds->u.n);
1746         params->tls_creds->type = QTYPE_QSTRING;
1747         params->tls_creds->u.s = strdup("");
1748     }
1749     /* TODO Rewrite "" to null instead */
1750     if (params->has_tls_hostname
1751         && params->tls_hostname->type == QTYPE_QNULL) {
1752         qobject_unref(params->tls_hostname->u.n);
1753         params->tls_hostname->type = QTYPE_QSTRING;
1754         params->tls_hostname->u.s = strdup("");
1755     }
1756 
1757     migrate_params_test_apply(params, &tmp);
1758 
1759     if (!migrate_params_check(&tmp, errp)) {
1760         /* Invalid parameter */
1761         return;
1762     }
1763 
1764     migrate_params_apply(params, errp);
1765 }
1766 
1767 
1768 void qmp_migrate_start_postcopy(Error **errp)
1769 {
1770     MigrationState *s = migrate_get_current();
1771 
1772     if (!migrate_postcopy()) {
1773         error_setg(errp, "Enable postcopy with migrate_set_capability before"
1774                          " the start of migration");
1775         return;
1776     }
1777 
1778     if (s->state == MIGRATION_STATUS_NONE) {
1779         error_setg(errp, "Postcopy must be started after migration has been"
1780                          " started");
1781         return;
1782     }
1783     /*
1784      * we don't error if migration has finished since that would be racy
1785      * with issuing this command.
1786      */
1787     qatomic_set(&s->start_postcopy, true);
1788 }
1789 
1790 /* shared migration helpers */
1791 
1792 void migrate_set_state(int *state, int old_state, int new_state)
1793 {
1794     assert(new_state < MIGRATION_STATUS__MAX);
1795     if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
1796         trace_migrate_set_state(MigrationStatus_str(new_state));
1797         migrate_generate_event(new_state);
1798     }
1799 }
1800 
1801 static MigrationCapabilityStatus *migrate_cap_add(MigrationCapability index,
1802                                                   bool state)
1803 {
1804     MigrationCapabilityStatus *cap;
1805 
1806     cap = g_new0(MigrationCapabilityStatus, 1);
1807     cap->capability = index;
1808     cap->state = state;
1809 
1810     return cap;
1811 }
1812 
1813 void migrate_set_block_enabled(bool value, Error **errp)
1814 {
1815     MigrationCapabilityStatusList *cap = NULL;
1816 
1817     QAPI_LIST_PREPEND(cap, migrate_cap_add(MIGRATION_CAPABILITY_BLOCK, value));
1818     qmp_migrate_set_capabilities(cap, errp);
1819     qapi_free_MigrationCapabilityStatusList(cap);
1820 }
1821 
1822 static void migrate_set_block_incremental(MigrationState *s, bool value)
1823 {
1824     s->parameters.block_incremental = value;
1825 }
1826 
1827 static void block_cleanup_parameters(MigrationState *s)
1828 {
1829     if (s->must_remove_block_options) {
1830         /* setting to false can never fail */
1831         migrate_set_block_enabled(false, &error_abort);
1832         migrate_set_block_incremental(s, false);
1833         s->must_remove_block_options = false;
1834     }
1835 }
1836 
1837 static void migrate_fd_cleanup(MigrationState *s)
1838 {
1839     qemu_bh_delete(s->cleanup_bh);
1840     s->cleanup_bh = NULL;
1841 
1842     g_free(s->hostname);
1843     s->hostname = NULL;
1844 
1845     qemu_savevm_state_cleanup();
1846 
1847     if (s->to_dst_file) {
1848         QEMUFile *tmp;
1849 
1850         trace_migrate_fd_cleanup();
1851         qemu_mutex_unlock_iothread();
1852         if (s->migration_thread_running) {
1853             qemu_thread_join(&s->thread);
1854             s->migration_thread_running = false;
1855         }
1856         qemu_mutex_lock_iothread();
1857 
1858         multifd_save_cleanup();
1859         qemu_mutex_lock(&s->qemu_file_lock);
1860         tmp = s->to_dst_file;
1861         s->to_dst_file = NULL;
1862         qemu_mutex_unlock(&s->qemu_file_lock);
1863         /*
1864          * Close the file handle without the lock to make sure the
1865          * critical section won't block for long.
1866          */
1867         migration_ioc_unregister_yank_from_file(tmp);
1868         qemu_fclose(tmp);
1869     }
1870 
1871     assert(!migration_is_active(s));
1872 
1873     if (s->state == MIGRATION_STATUS_CANCELLING) {
1874         migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
1875                           MIGRATION_STATUS_CANCELLED);
1876     }
1877 
1878     if (s->error) {
1879         /* It is used on info migrate.  We can't free it */
1880         error_report_err(error_copy(s->error));
1881     }
1882     notifier_list_notify(&migration_state_notifiers, s);
1883     block_cleanup_parameters(s);
1884     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1885 }
1886 
1887 static void migrate_fd_cleanup_schedule(MigrationState *s)
1888 {
1889     /*
1890      * Ref the state for bh, because it may be called when
1891      * there're already no other refs
1892      */
1893     object_ref(OBJECT(s));
1894     qemu_bh_schedule(s->cleanup_bh);
1895 }
1896 
1897 static void migrate_fd_cleanup_bh(void *opaque)
1898 {
1899     MigrationState *s = opaque;
1900     migrate_fd_cleanup(s);
1901     object_unref(OBJECT(s));
1902 }
1903 
1904 void migrate_set_error(MigrationState *s, const Error *error)
1905 {
1906     QEMU_LOCK_GUARD(&s->error_mutex);
1907     if (!s->error) {
1908         s->error = error_copy(error);
1909     }
1910 }
1911 
1912 static void migrate_error_free(MigrationState *s)
1913 {
1914     QEMU_LOCK_GUARD(&s->error_mutex);
1915     if (s->error) {
1916         error_free(s->error);
1917         s->error = NULL;
1918     }
1919 }
1920 
1921 void migrate_fd_error(MigrationState *s, const Error *error)
1922 {
1923     trace_migrate_fd_error(error_get_pretty(error));
1924     assert(s->to_dst_file == NULL);
1925     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1926                       MIGRATION_STATUS_FAILED);
1927     migrate_set_error(s, error);
1928 }
1929 
1930 static void migrate_fd_cancel(MigrationState *s)
1931 {
1932     int old_state ;
1933     QEMUFile *f = migrate_get_current()->to_dst_file;
1934     trace_migrate_fd_cancel();
1935 
1936     WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
1937         if (s->rp_state.from_dst_file) {
1938             /* shutdown the rp socket, so causing the rp thread to shutdown */
1939             qemu_file_shutdown(s->rp_state.from_dst_file);
1940         }
1941     }
1942 
1943     do {
1944         old_state = s->state;
1945         if (!migration_is_running(old_state)) {
1946             break;
1947         }
1948         /* If the migration is paused, kick it out of the pause */
1949         if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
1950             qemu_sem_post(&s->pause_sem);
1951         }
1952         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
1953     } while (s->state != MIGRATION_STATUS_CANCELLING);
1954 
1955     /*
1956      * If we're unlucky the migration code might be stuck somewhere in a
1957      * send/write while the network has failed and is waiting to timeout;
1958      * if we've got shutdown(2) available then we can force it to quit.
1959      * The outgoing qemu file gets closed in migrate_fd_cleanup that is
1960      * called in a bh, so there is no race against this cancel.
1961      */
1962     if (s->state == MIGRATION_STATUS_CANCELLING && f) {
1963         qemu_file_shutdown(f);
1964     }
1965     if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
1966         Error *local_err = NULL;
1967 
1968         bdrv_activate_all(&local_err);
1969         if (local_err) {
1970             error_report_err(local_err);
1971         } else {
1972             s->block_inactive = false;
1973         }
1974     }
1975 }
1976 
1977 void add_migration_state_change_notifier(Notifier *notify)
1978 {
1979     notifier_list_add(&migration_state_notifiers, notify);
1980 }
1981 
1982 void remove_migration_state_change_notifier(Notifier *notify)
1983 {
1984     notifier_remove(notify);
1985 }
1986 
1987 bool migration_in_setup(MigrationState *s)
1988 {
1989     return s->state == MIGRATION_STATUS_SETUP;
1990 }
1991 
1992 bool migration_has_finished(MigrationState *s)
1993 {
1994     return s->state == MIGRATION_STATUS_COMPLETED;
1995 }
1996 
1997 bool migration_has_failed(MigrationState *s)
1998 {
1999     return (s->state == MIGRATION_STATUS_CANCELLED ||
2000             s->state == MIGRATION_STATUS_FAILED);
2001 }
2002 
2003 bool migration_in_postcopy(void)
2004 {
2005     MigrationState *s = migrate_get_current();
2006 
2007     switch (s->state) {
2008     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
2009     case MIGRATION_STATUS_POSTCOPY_PAUSED:
2010     case MIGRATION_STATUS_POSTCOPY_RECOVER:
2011         return true;
2012     default:
2013         return false;
2014     }
2015 }
2016 
2017 bool migration_in_postcopy_after_devices(MigrationState *s)
2018 {
2019     return migration_in_postcopy() && s->postcopy_after_devices;
2020 }
2021 
2022 bool migration_in_incoming_postcopy(void)
2023 {
2024     PostcopyState ps = postcopy_state_get();
2025 
2026     return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
2027 }
2028 
2029 bool migration_in_bg_snapshot(void)
2030 {
2031     MigrationState *s = migrate_get_current();
2032 
2033     return migrate_background_snapshot() &&
2034             migration_is_setup_or_active(s->state);
2035 }
2036 
2037 bool migration_is_idle(void)
2038 {
2039     MigrationState *s = current_migration;
2040 
2041     if (!s) {
2042         return true;
2043     }
2044 
2045     switch (s->state) {
2046     case MIGRATION_STATUS_NONE:
2047     case MIGRATION_STATUS_CANCELLED:
2048     case MIGRATION_STATUS_COMPLETED:
2049     case MIGRATION_STATUS_FAILED:
2050         return true;
2051     case MIGRATION_STATUS_SETUP:
2052     case MIGRATION_STATUS_CANCELLING:
2053     case MIGRATION_STATUS_ACTIVE:
2054     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
2055     case MIGRATION_STATUS_COLO:
2056     case MIGRATION_STATUS_PRE_SWITCHOVER:
2057     case MIGRATION_STATUS_DEVICE:
2058     case MIGRATION_STATUS_WAIT_UNPLUG:
2059         return false;
2060     case MIGRATION_STATUS__MAX:
2061         g_assert_not_reached();
2062     }
2063 
2064     return false;
2065 }
2066 
2067 bool migration_is_active(MigrationState *s)
2068 {
2069     return (s->state == MIGRATION_STATUS_ACTIVE ||
2070             s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2071 }
2072 
2073 void migrate_init(MigrationState *s)
2074 {
2075     /*
2076      * Reinitialise all migration state, except
2077      * parameters/capabilities that the user set, and
2078      * locks.
2079      */
2080     s->cleanup_bh = 0;
2081     s->vm_start_bh = 0;
2082     s->to_dst_file = NULL;
2083     s->state = MIGRATION_STATUS_NONE;
2084     s->rp_state.from_dst_file = NULL;
2085     s->rp_state.error = false;
2086     s->mbps = 0.0;
2087     s->pages_per_second = 0.0;
2088     s->downtime = 0;
2089     s->expected_downtime = 0;
2090     s->setup_time = 0;
2091     s->start_postcopy = false;
2092     s->postcopy_after_devices = false;
2093     s->migration_thread_running = false;
2094     error_free(s->error);
2095     s->error = NULL;
2096     s->hostname = NULL;
2097 
2098     migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
2099 
2100     s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2101     s->total_time = 0;
2102     s->vm_was_running = false;
2103     s->iteration_initial_bytes = 0;
2104     s->threshold_size = 0;
2105 }
2106 
2107 int migrate_add_blocker_internal(Error *reason, Error **errp)
2108 {
2109     /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
2110     if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
2111         error_propagate_prepend(errp, error_copy(reason),
2112                                 "disallowing migration blocker "
2113                                 "(migration/snapshot in progress) for: ");
2114         return -EBUSY;
2115     }
2116 
2117     migration_blockers = g_slist_prepend(migration_blockers, reason);
2118     return 0;
2119 }
2120 
2121 int migrate_add_blocker(Error *reason, Error **errp)
2122 {
2123     if (only_migratable) {
2124         error_propagate_prepend(errp, error_copy(reason),
2125                                 "disallowing migration blocker "
2126                                 "(--only-migratable) for: ");
2127         return -EACCES;
2128     }
2129 
2130     return migrate_add_blocker_internal(reason, errp);
2131 }
2132 
2133 void migrate_del_blocker(Error *reason)
2134 {
2135     migration_blockers = g_slist_remove(migration_blockers, reason);
2136 }
2137 
2138 void qmp_migrate_incoming(const char *uri, Error **errp)
2139 {
2140     Error *local_err = NULL;
2141     static bool once = true;
2142 
2143     if (!once) {
2144         error_setg(errp, "The incoming migration has already been started");
2145         return;
2146     }
2147     if (!runstate_check(RUN_STATE_INMIGRATE)) {
2148         error_setg(errp, "'-incoming' was not specified on the command line");
2149         return;
2150     }
2151 
2152     if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2153         return;
2154     }
2155 
2156     qemu_start_incoming_migration(uri, &local_err);
2157 
2158     if (local_err) {
2159         yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2160         error_propagate(errp, local_err);
2161         return;
2162     }
2163 
2164     once = false;
2165 }
2166 
2167 void qmp_migrate_recover(const char *uri, Error **errp)
2168 {
2169     MigrationIncomingState *mis = migration_incoming_get_current();
2170 
2171     /*
2172      * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
2173      * callers (no one should ignore a recover failure); if there is, it's a
2174      * programming error.
2175      */
2176     assert(errp);
2177 
2178     if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2179         error_setg(errp, "Migrate recover can only be run "
2180                    "when postcopy is paused.");
2181         return;
2182     }
2183 
2184     /* If there's an existing transport, release it */
2185     migration_incoming_transport_cleanup(mis);
2186 
2187     /*
2188      * Note that this call will never start a real migration; it will
2189      * only re-setup the migration stream and poke existing migration
2190      * to continue using that newly established channel.
2191      */
2192     qemu_start_incoming_migration(uri, errp);
2193 }
2194 
2195 void qmp_migrate_pause(Error **errp)
2196 {
2197     MigrationState *ms = migrate_get_current();
2198     MigrationIncomingState *mis = migration_incoming_get_current();
2199     int ret;
2200 
2201     if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2202         /* Source side, during postcopy */
2203         qemu_mutex_lock(&ms->qemu_file_lock);
2204         ret = qemu_file_shutdown(ms->to_dst_file);
2205         qemu_mutex_unlock(&ms->qemu_file_lock);
2206         if (ret) {
2207             error_setg(errp, "Failed to pause source migration");
2208         }
2209         return;
2210     }
2211 
2212     if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2213         ret = qemu_file_shutdown(mis->from_src_file);
2214         if (ret) {
2215             error_setg(errp, "Failed to pause destination migration");
2216         }
2217         return;
2218     }
2219 
2220     error_setg(errp, "migrate-pause is currently only supported "
2221                "during postcopy-active state");
2222 }
2223 
2224 bool migration_is_blocked(Error **errp)
2225 {
2226     if (qemu_savevm_state_blocked(errp)) {
2227         return true;
2228     }
2229 
2230     if (migration_blockers) {
2231         error_propagate(errp, error_copy(migration_blockers->data));
2232         return true;
2233     }
2234 
2235     return false;
2236 }
2237 
2238 /* Returns true if continue to migrate, or false if error detected */
2239 static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
2240                             bool resume, Error **errp)
2241 {
2242     Error *local_err = NULL;
2243 
2244     if (resume) {
2245         if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2246             error_setg(errp, "Cannot resume if there is no "
2247                        "paused migration");
2248             return false;
2249         }
2250 
2251         /*
2252          * Postcopy recovery won't work well with release-ram
2253          * capability since release-ram will drop the page buffer as
2254          * long as the page is put into the send buffer.  So if there
2255          * is a network failure happened, any page buffers that have
2256          * not yet reached the destination VM but have already been
2257          * sent from the source VM will be lost forever.  Let's refuse
2258          * the client from resuming such a postcopy migration.
2259          * Luckily release-ram was designed to only be used when src
2260          * and destination VMs are on the same host, so it should be
2261          * fine.
2262          */
2263         if (migrate_release_ram()) {
2264             error_setg(errp, "Postcopy recovery cannot work "
2265                        "when release-ram capability is set");
2266             return false;
2267         }
2268 
2269         /* This is a resume, skip init status */
2270         return true;
2271     }
2272 
2273     if (migration_is_running(s->state)) {
2274         error_setg(errp, QERR_MIGRATION_ACTIVE);
2275         return false;
2276     }
2277 
2278     if (runstate_check(RUN_STATE_INMIGRATE)) {
2279         error_setg(errp, "Guest is waiting for an incoming migration");
2280         return false;
2281     }
2282 
2283     if (runstate_check(RUN_STATE_POSTMIGRATE)) {
2284         error_setg(errp, "Can't migrate the vm that was paused due to "
2285                    "previous migration");
2286         return false;
2287     }
2288 
2289     if (migration_is_blocked(errp)) {
2290         return false;
2291     }
2292 
2293     if (blk || blk_inc) {
2294         if (migrate_colo_enabled()) {
2295             error_setg(errp, "No disk migration is required in COLO mode");
2296             return false;
2297         }
2298         if (migrate_use_block() || migrate_use_block_incremental()) {
2299             error_setg(errp, "Command options are incompatible with "
2300                        "current migration capabilities");
2301             return false;
2302         }
2303         migrate_set_block_enabled(true, &local_err);
2304         if (local_err) {
2305             error_propagate(errp, local_err);
2306             return false;
2307         }
2308         s->must_remove_block_options = true;
2309     }
2310 
2311     if (blk_inc) {
2312         migrate_set_block_incremental(s, true);
2313     }
2314 
2315     migrate_init(s);
2316     /*
2317      * set ram_counters compression_counters memory to zero for a
2318      * new migration
2319      */
2320     memset(&ram_counters, 0, sizeof(ram_counters));
2321     memset(&compression_counters, 0, sizeof(compression_counters));
2322 
2323     return true;
2324 }
2325 
2326 void qmp_migrate(const char *uri, bool has_blk, bool blk,
2327                  bool has_inc, bool inc, bool has_detach, bool detach,
2328                  bool has_resume, bool resume, Error **errp)
2329 {
2330     Error *local_err = NULL;
2331     MigrationState *s = migrate_get_current();
2332     const char *p = NULL;
2333 
2334     if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
2335                          has_resume && resume, errp)) {
2336         /* Error detected, put into errp */
2337         return;
2338     }
2339 
2340     if (!(has_resume && resume)) {
2341         if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2342             return;
2343         }
2344     }
2345 
2346     migrate_protocol_allow_multi_channels(false);
2347     if (strstart(uri, "tcp:", &p) ||
2348         strstart(uri, "unix:", NULL) ||
2349         strstart(uri, "vsock:", NULL)) {
2350         migrate_protocol_allow_multi_channels(true);
2351         socket_start_outgoing_migration(s, p ? p : uri, &local_err);
2352 #ifdef CONFIG_RDMA
2353     } else if (strstart(uri, "rdma:", &p)) {
2354         rdma_start_outgoing_migration(s, p, &local_err);
2355 #endif
2356     } else if (strstart(uri, "exec:", &p)) {
2357         exec_start_outgoing_migration(s, p, &local_err);
2358     } else if (strstart(uri, "fd:", &p)) {
2359         fd_start_outgoing_migration(s, p, &local_err);
2360     } else {
2361         if (!(has_resume && resume)) {
2362             yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2363         }
2364         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
2365                    "a valid migration protocol");
2366         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2367                           MIGRATION_STATUS_FAILED);
2368         block_cleanup_parameters(s);
2369         return;
2370     }
2371 
2372     if (local_err) {
2373         if (!(has_resume && resume)) {
2374             yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2375         }
2376         migrate_fd_error(s, local_err);
2377         error_propagate(errp, local_err);
2378         return;
2379     }
2380 }
2381 
2382 void qmp_migrate_cancel(Error **errp)
2383 {
2384     migration_cancel(NULL);
2385 }
2386 
2387 void qmp_migrate_continue(MigrationStatus state, Error **errp)
2388 {
2389     MigrationState *s = migrate_get_current();
2390     if (s->state != state) {
2391         error_setg(errp,  "Migration not in expected state: %s",
2392                    MigrationStatus_str(s->state));
2393         return;
2394     }
2395     qemu_sem_post(&s->pause_sem);
2396 }
2397 
2398 bool migrate_release_ram(void)
2399 {
2400     MigrationState *s;
2401 
2402     s = migrate_get_current();
2403 
2404     return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
2405 }
2406 
2407 bool migrate_postcopy_ram(void)
2408 {
2409     MigrationState *s;
2410 
2411     s = migrate_get_current();
2412 
2413     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
2414 }
2415 
2416 bool migrate_postcopy(void)
2417 {
2418     return migrate_postcopy_ram() || migrate_dirty_bitmaps();
2419 }
2420 
2421 bool migrate_auto_converge(void)
2422 {
2423     MigrationState *s;
2424 
2425     s = migrate_get_current();
2426 
2427     return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
2428 }
2429 
2430 bool migrate_zero_blocks(void)
2431 {
2432     MigrationState *s;
2433 
2434     s = migrate_get_current();
2435 
2436     return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
2437 }
2438 
2439 bool migrate_postcopy_blocktime(void)
2440 {
2441     MigrationState *s;
2442 
2443     s = migrate_get_current();
2444 
2445     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
2446 }
2447 
2448 bool migrate_use_compression(void)
2449 {
2450     MigrationState *s;
2451 
2452     s = migrate_get_current();
2453 
2454     return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
2455 }
2456 
2457 int migrate_compress_level(void)
2458 {
2459     MigrationState *s;
2460 
2461     s = migrate_get_current();
2462 
2463     return s->parameters.compress_level;
2464 }
2465 
2466 int migrate_compress_threads(void)
2467 {
2468     MigrationState *s;
2469 
2470     s = migrate_get_current();
2471 
2472     return s->parameters.compress_threads;
2473 }
2474 
2475 int migrate_compress_wait_thread(void)
2476 {
2477     MigrationState *s;
2478 
2479     s = migrate_get_current();
2480 
2481     return s->parameters.compress_wait_thread;
2482 }
2483 
2484 int migrate_decompress_threads(void)
2485 {
2486     MigrationState *s;
2487 
2488     s = migrate_get_current();
2489 
2490     return s->parameters.decompress_threads;
2491 }
2492 
2493 bool migrate_dirty_bitmaps(void)
2494 {
2495     MigrationState *s;
2496 
2497     s = migrate_get_current();
2498 
2499     return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
2500 }
2501 
2502 bool migrate_ignore_shared(void)
2503 {
2504     MigrationState *s;
2505 
2506     s = migrate_get_current();
2507 
2508     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED];
2509 }
2510 
2511 bool migrate_validate_uuid(void)
2512 {
2513     MigrationState *s;
2514 
2515     s = migrate_get_current();
2516 
2517     return s->enabled_capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID];
2518 }
2519 
2520 bool migrate_use_events(void)
2521 {
2522     MigrationState *s;
2523 
2524     s = migrate_get_current();
2525 
2526     return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
2527 }
2528 
2529 bool migrate_use_multifd(void)
2530 {
2531     MigrationState *s;
2532 
2533     s = migrate_get_current();
2534 
2535     return s->enabled_capabilities[MIGRATION_CAPABILITY_MULTIFD];
2536 }
2537 
2538 bool migrate_pause_before_switchover(void)
2539 {
2540     MigrationState *s;
2541 
2542     s = migrate_get_current();
2543 
2544     return s->enabled_capabilities[
2545         MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
2546 }
2547 
2548 int migrate_multifd_channels(void)
2549 {
2550     MigrationState *s;
2551 
2552     s = migrate_get_current();
2553 
2554     return s->parameters.multifd_channels;
2555 }
2556 
2557 MultiFDCompression migrate_multifd_compression(void)
2558 {
2559     MigrationState *s;
2560 
2561     s = migrate_get_current();
2562 
2563     return s->parameters.multifd_compression;
2564 }
2565 
2566 int migrate_multifd_zlib_level(void)
2567 {
2568     MigrationState *s;
2569 
2570     s = migrate_get_current();
2571 
2572     return s->parameters.multifd_zlib_level;
2573 }
2574 
2575 int migrate_multifd_zstd_level(void)
2576 {
2577     MigrationState *s;
2578 
2579     s = migrate_get_current();
2580 
2581     return s->parameters.multifd_zstd_level;
2582 }
2583 
#ifdef CONFIG_LINUX
/*
 * Return whether MIGRATION_CAPABILITY_ZERO_COPY_SEND is enabled.
 * Only built when CONFIG_LINUX is defined.
 */
bool migrate_use_zero_copy_send(void)
{
    MigrationState *ms = migrate_get_current();

    return ms->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND];
}
#endif
2594 
2595 int migrate_use_tls(void)
2596 {
2597     MigrationState *s;
2598 
2599     s = migrate_get_current();
2600 
2601     return s->parameters.tls_creds && *s->parameters.tls_creds;
2602 }
2603 
2604 int migrate_use_xbzrle(void)
2605 {
2606     MigrationState *s;
2607 
2608     s = migrate_get_current();
2609 
2610     return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
2611 }
2612 
2613 uint64_t migrate_xbzrle_cache_size(void)
2614 {
2615     MigrationState *s;
2616 
2617     s = migrate_get_current();
2618 
2619     return s->parameters.xbzrle_cache_size;
2620 }
2621 
2622 static int64_t migrate_max_postcopy_bandwidth(void)
2623 {
2624     MigrationState *s;
2625 
2626     s = migrate_get_current();
2627 
2628     return s->parameters.max_postcopy_bandwidth;
2629 }
2630 
2631 bool migrate_use_block(void)
2632 {
2633     MigrationState *s;
2634 
2635     s = migrate_get_current();
2636 
2637     return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK];
2638 }
2639 
2640 bool migrate_use_return_path(void)
2641 {
2642     MigrationState *s;
2643 
2644     s = migrate_get_current();
2645 
2646     return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
2647 }
2648 
2649 bool migrate_use_block_incremental(void)
2650 {
2651     MigrationState *s;
2652 
2653     s = migrate_get_current();
2654 
2655     return s->parameters.block_incremental;
2656 }
2657 
2658 bool migrate_background_snapshot(void)
2659 {
2660     MigrationState *s;
2661 
2662     s = migrate_get_current();
2663 
2664     return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
2665 }
2666 
2667 /* migration thread support */
2668 /*
2669  * Something bad happened to the RP stream, mark an error
2670  * The caller shall print or trace something to indicate why
2671  */
2672 static void mark_source_rp_bad(MigrationState *s)
2673 {
2674     s->rp_state.error = true;
2675 }
2676 
/*
 * Per-message description of return-path commands, indexed by the
 * MIG_RP_MSG_* message type: the payload length declared for the message
 * and a printable name.
 */
static struct rp_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name; /* printable name of the message type */
} rp_cmd_args[] = {
    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
    [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
};
2690 
2691 /*
2692  * Process a request for pages received on the return path,
2693  * We're allowed to send more than requested (e.g. to round to our page size)
2694  * and we don't need to send pages that have already been sent.
2695  */
2696 static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
2697                                        ram_addr_t start, size_t len)
2698 {
2699     long our_host_ps = qemu_real_host_page_size();
2700 
2701     trace_migrate_handle_rp_req_pages(rbname, start, len);
2702 
2703     /*
2704      * Since we currently insist on matching page sizes, just sanity check
2705      * we're being asked for whole host pages.
2706      */
2707     if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
2708         !QEMU_IS_ALIGNED(len, our_host_ps)) {
2709         error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
2710                      " len: %zd", __func__, start, len);
2711         mark_source_rp_bad(ms);
2712         return;
2713     }
2714 
2715     if (ram_save_queue_pages(rbname, start, len)) {
2716         mark_source_rp_bad(ms);
2717     }
2718 }
2719 
2720 /* Return true to retry, false to quit */
2721 static bool postcopy_pause_return_path_thread(MigrationState *s)
2722 {
2723     trace_postcopy_pause_return_path();
2724 
2725     qemu_sem_wait(&s->postcopy_pause_rp_sem);
2726 
2727     trace_postcopy_pause_return_path_continued();
2728 
2729     return true;
2730 }
2731 
2732 static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
2733 {
2734     RAMBlock *block = qemu_ram_block_by_name(block_name);
2735 
2736     if (!block) {
2737         error_report("%s: invalid block name '%s'", __func__, block_name);
2738         return -EINVAL;
2739     }
2740 
2741     /* Fetch the received bitmap and refresh the dirty bitmap */
2742     return ram_dirty_bitmap_reload(s, block);
2743 }
2744 
2745 static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
2746 {
2747     trace_source_return_path_thread_resume_ack(value);
2748 
2749     if (value != MIGRATION_RESUME_ACK_VALUE) {
2750         error_report("%s: illegal resume_ack value %"PRIu32,
2751                      __func__, value);
2752         return -1;
2753     }
2754 
2755     /* Now both sides are active. */
2756     migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2757                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
2758 
2759     /* Notify send thread that time to continue send pages */
2760     qemu_sem_post(&s->rp_state.rp_sem);
2761 
2762     return 0;
2763 }
2764 
2765 /* Release ms->rp_state.from_dst_file in a safe way */
2766 static void migration_release_from_dst_file(MigrationState *ms)
2767 {
2768     QEMUFile *file;
2769 
2770     WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2771         /*
2772          * Reset the from_dst_file pointer first before releasing it, as we
2773          * can't block within lock section
2774          */
2775         file = ms->rp_state.from_dst_file;
2776         ms->rp_state.from_dst_file = NULL;
2777     }
2778 
2779     qemu_fclose(file);
2780 }
2781 
2782 /*
2783  * Handles messages sent on the return path towards the source VM
2784  *
2785  */
2786 static void *source_return_path_thread(void *opaque)
2787 {
2788     MigrationState *ms = opaque;
2789     QEMUFile *rp = ms->rp_state.from_dst_file;
2790     uint16_t header_len, header_type;
2791     uint8_t buf[512];
2792     uint32_t tmp32, sibling_error;
2793     ram_addr_t start = 0; /* =0 to silence warning */
2794     size_t  len = 0, expected_len;
2795     int res;
2796 
2797     trace_source_return_path_thread_entry();
2798     rcu_register_thread();
2799 
2800 retry:
2801     while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
2802            migration_is_setup_or_active(ms->state)) {
2803         trace_source_return_path_thread_loop_top();
2804         header_type = qemu_get_be16(rp);
2805         header_len = qemu_get_be16(rp);
2806 
2807         if (qemu_file_get_error(rp)) {
2808             mark_source_rp_bad(ms);
2809             goto out;
2810         }
2811 
2812         if (header_type >= MIG_RP_MSG_MAX ||
2813             header_type == MIG_RP_MSG_INVALID) {
2814             error_report("RP: Received invalid message 0x%04x length 0x%04x",
2815                          header_type, header_len);
2816             mark_source_rp_bad(ms);
2817             goto out;
2818         }
2819 
2820         if ((rp_cmd_args[header_type].len != -1 &&
2821             header_len != rp_cmd_args[header_type].len) ||
2822             header_len > sizeof(buf)) {
2823             error_report("RP: Received '%s' message (0x%04x) with"
2824                          "incorrect length %d expecting %zu",
2825                          rp_cmd_args[header_type].name, header_type, header_len,
2826                          (size_t)rp_cmd_args[header_type].len);
2827             mark_source_rp_bad(ms);
2828             goto out;
2829         }
2830 
2831         /* We know we've got a valid header by this point */
2832         res = qemu_get_buffer(rp, buf, header_len);
2833         if (res != header_len) {
2834             error_report("RP: Failed reading data for message 0x%04x"
2835                          " read %d expected %d",
2836                          header_type, res, header_len);
2837             mark_source_rp_bad(ms);
2838             goto out;
2839         }
2840 
2841         /* OK, we have the message and the data */
2842         switch (header_type) {
2843         case MIG_RP_MSG_SHUT:
2844             sibling_error = ldl_be_p(buf);
2845             trace_source_return_path_thread_shut(sibling_error);
2846             if (sibling_error) {
2847                 error_report("RP: Sibling indicated error %d", sibling_error);
2848                 mark_source_rp_bad(ms);
2849             }
2850             /*
2851              * We'll let the main thread deal with closing the RP
2852              * we could do a shutdown(2) on it, but we're the only user
2853              * anyway, so there's nothing gained.
2854              */
2855             goto out;
2856 
2857         case MIG_RP_MSG_PONG:
2858             tmp32 = ldl_be_p(buf);
2859             trace_source_return_path_thread_pong(tmp32);
2860             break;
2861 
2862         case MIG_RP_MSG_REQ_PAGES:
2863             start = ldq_be_p(buf);
2864             len = ldl_be_p(buf + 8);
2865             migrate_handle_rp_req_pages(ms, NULL, start, len);
2866             break;
2867 
2868         case MIG_RP_MSG_REQ_PAGES_ID:
2869             expected_len = 12 + 1; /* header + termination */
2870 
2871             if (header_len >= expected_len) {
2872                 start = ldq_be_p(buf);
2873                 len = ldl_be_p(buf + 8);
2874                 /* Now we expect an idstr */
2875                 tmp32 = buf[12]; /* Length of the following idstr */
2876                 buf[13 + tmp32] = '\0';
2877                 expected_len += tmp32;
2878             }
2879             if (header_len != expected_len) {
2880                 error_report("RP: Req_Page_id with length %d expecting %zd",
2881                              header_len, expected_len);
2882                 mark_source_rp_bad(ms);
2883                 goto out;
2884             }
2885             migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
2886             break;
2887 
2888         case MIG_RP_MSG_RECV_BITMAP:
2889             if (header_len < 1) {
2890                 error_report("%s: missing block name", __func__);
2891                 mark_source_rp_bad(ms);
2892                 goto out;
2893             }
2894             /* Format: len (1B) + idstr (<255B). This ends the idstr. */
2895             buf[buf[0] + 1] = '\0';
2896             if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
2897                 mark_source_rp_bad(ms);
2898                 goto out;
2899             }
2900             break;
2901 
2902         case MIG_RP_MSG_RESUME_ACK:
2903             tmp32 = ldl_be_p(buf);
2904             if (migrate_handle_rp_resume_ack(ms, tmp32)) {
2905                 mark_source_rp_bad(ms);
2906                 goto out;
2907             }
2908             break;
2909 
2910         default:
2911             break;
2912         }
2913     }
2914 
2915 out:
2916     res = qemu_file_get_error(rp);
2917     if (res) {
2918         if (res && migration_in_postcopy()) {
2919             /*
2920              * Maybe there is something we can do: it looks like a
2921              * network down issue, and we pause for a recovery.
2922              */
2923             migration_release_from_dst_file(ms);
2924             rp = NULL;
2925             if (postcopy_pause_return_path_thread(ms)) {
2926                 /*
2927                  * Reload rp, reset the rest.  Referencing it is safe since
2928                  * it's reset only by us above, or when migration completes
2929                  */
2930                 rp = ms->rp_state.from_dst_file;
2931                 ms->rp_state.error = false;
2932                 goto retry;
2933             }
2934         }
2935 
2936         trace_source_return_path_thread_bad_end();
2937         mark_source_rp_bad(ms);
2938     }
2939 
2940     trace_source_return_path_thread_end();
2941     migration_release_from_dst_file(ms);
2942     rcu_unregister_thread();
2943     return NULL;
2944 }
2945 
2946 static int open_return_path_on_source(MigrationState *ms,
2947                                       bool create_thread)
2948 {
2949     ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
2950     if (!ms->rp_state.from_dst_file) {
2951         return -1;
2952     }
2953 
2954     trace_open_return_path_on_source();
2955 
2956     if (!create_thread) {
2957         /* We're done */
2958         return 0;
2959     }
2960 
2961     qemu_thread_create(&ms->rp_state.rp_thread, "return path",
2962                        source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
2963     ms->rp_state.rp_thread_created = true;
2964 
2965     trace_open_return_path_on_source_continue();
2966 
2967     return 0;
2968 }
2969 
2970 /* Returns 0 if the RP was ok, otherwise there was an error on the RP */
2971 static int await_return_path_close_on_source(MigrationState *ms)
2972 {
2973     /*
2974      * If this is a normal exit then the destination will send a SHUT and the
2975      * rp_thread will exit, however if there's an error we need to cause
2976      * it to exit.
2977      */
2978     if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
2979         /*
2980          * shutdown(2), if we have it, will cause it to unblock if it's stuck
2981          * waiting for the destination.
2982          */
2983         qemu_file_shutdown(ms->rp_state.from_dst_file);
2984         mark_source_rp_bad(ms);
2985     }
2986     trace_await_return_path_close_on_source_joining();
2987     qemu_thread_join(&ms->rp_state.rp_thread);
2988     ms->rp_state.rp_thread_created = false;
2989     trace_await_return_path_close_on_source_close();
2990     return ms->rp_state.error;
2991 }
2992 
/*
 * Switch from normal iteration to postcopy
 *
 * Stops the VM, inactivates the block devices, sends the final data of the
 * non-postcopiable iterative devices, then collects the remaining device
 * state into a buffer and sends it to the destination as one package.
 * The iothread lock is taken here and released again on every return path.
 *
 * Returns non-0 on error
 */
static int postcopy_start(MigrationState *ms)
{
    int ret;
    QIOChannelBuffer *bioc;
    QEMUFile *fb;
    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    int64_t bandwidth = migrate_max_postcopy_bandwidth();
    /* Whether the fail path must re-activate the block devices */
    bool restart_block = false;
    int cur_state = MIGRATION_STATUS_ACTIVE;
    /*
     * With pause-before-switchover the state change is left to
     * migration_maybe_pause() below.
     */
    if (!migrate_pause_before_switchover()) {
        migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_POSTCOPY_ACTIVE);
    }

    trace_postcopy_start();
    qemu_mutex_lock_iothread();
    trace_postcopy_start_set_run();

    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
    global_state_store();
    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
    if (ret < 0) {
        goto fail;
    }

    ret = migration_maybe_pause(ms, &cur_state,
                                MIGRATION_STATUS_POSTCOPY_ACTIVE);
    if (ret < 0) {
        goto fail;
    }

    ret = bdrv_inactivate_all();
    if (ret < 0) {
        goto fail;
    }
    /* Block devices are inactive now: a failure has to activate them again */
    restart_block = true;

    /*
     * Cause any non-postcopiable, but iterative devices to
     * send out their final data.
     */
    qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);

    /*
     * in Finish migrate and with the io-lock held everything should
     * be quiet, but we've potentially still got dirty pages and we
     * need to tell the destination to throw any pages it's already received
     * that are dirty
     */
    if (migrate_postcopy_ram()) {
        ram_postcopy_send_discard_bitmap(ms);
    }

    /*
     * send rest of state - note things that are doing postcopy
     * will notice we're in POSTCOPY_ACTIVE and not actually
     * wrap their state up here
     */
    /* 0 max-postcopy-bandwidth means unlimited */
    if (!bandwidth) {
        qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
    } else {
        qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
    }
    if (migrate_postcopy_ram()) {
        /* Ping just for debugging, helps line traces up */
        qemu_savevm_send_ping(ms->to_dst_file, 2);
    }

    /*
     * While loading the device state we may trigger page transfer
     * requests and the fd must be free to process those, and thus
     * the destination must read the whole device state off the fd before
     * it starts processing it.  Unfortunately the ad-hoc migration format
     * doesn't allow the destination to know the size to read without fully
     * parsing it through each devices load-state code (especially the open
     * coded devices that use get/put).
     * So we wrap the device state up in a package with a length at the start;
     * to do this we use a qemu_buf to hold the whole of the device state.
     */
    bioc = qio_channel_buffer_new(4096);
    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
    fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
    /*
     * NOTE(review): bioc->data is still read further down; presumably fb
     * holds its own reference to the channel until qemu_fclose(fb) - confirm.
     */
    object_unref(OBJECT(bioc));

    /*
     * Make sure the receiver can get incoming pages before we send the rest
     * of the state
     */
    qemu_savevm_send_postcopy_listen(fb);

    qemu_savevm_state_complete_precopy(fb, false, false);
    if (migrate_postcopy_ram()) {
        qemu_savevm_send_ping(fb, 3);
    }

    qemu_savevm_send_postcopy_run(fb);

    /* <><> end of stuff going into the package */

    /* Last point of recovery; as soon as we send the package the destination
     * can open devices and potentially start running.
     * Lets just check again we've not got any errors.
     */
    ret = qemu_file_get_error(ms->to_dst_file);
    if (ret) {
        error_report("postcopy_start: Migration stream errored (pre package)");
        goto fail_closefb;
    }

    /*
     * From here on the fail path no longer re-activates the block devices,
     * since the destination may start using them once the package is sent.
     */
    restart_block = false;

    /* Now send that blob */
    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
        goto fail_closefb;
    }
    qemu_fclose(fb);

    /* Send a notify to give a chance for anything that needs to happen
     * at the transition to postcopy and after the device state; in particular
     * spice needs to trigger a transition now
     */
    ms->postcopy_after_devices = true;
    notifier_list_notify(&migration_state_notifiers, ms);

    /* Downtime is measured from function entry up to this point */
    ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;

    qemu_mutex_unlock_iothread();

    if (migrate_postcopy_ram()) {
        /*
         * Although this ping is just for debug, it could potentially be
         * used for getting a better measurement of downtime at the source.
         */
        qemu_savevm_send_ping(ms->to_dst_file, 4);
    }

    if (migrate_release_ram()) {
        ram_postcopy_migrated_memory_release(ms);
    }

    /* The iothread lock is already released here, so this does not go to fail */
    ret = qemu_file_get_error(ms->to_dst_file);
    if (ret) {
        error_report("postcopy_start: Migration stream errored");
        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                              MIGRATION_STATUS_FAILED);
    }

    return ret;

fail_closefb:
    qemu_fclose(fb);
fail:
    /* Both labels are reached with the iothread lock still held */
    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                          MIGRATION_STATUS_FAILED);
    if (restart_block) {
        /* A failure happened early enough that we know the destination hasn't
         * accessed block devices, so we're safe to recover.
         */
        Error *local_err = NULL;

        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        }
    }
    qemu_mutex_unlock_iothread();
    return -1;
}
3166 
3167 /**
3168  * migration_maybe_pause: Pause if required to by
3169  * migrate_pause_before_switchover called with the iothread locked
3170  * Returns: 0 on success
3171  */
3172 static int migration_maybe_pause(MigrationState *s,
3173                                  int *current_active_state,
3174                                  int new_state)
3175 {
3176     if (!migrate_pause_before_switchover()) {
3177         return 0;
3178     }
3179 
3180     /* Since leaving this state is not atomic with posting the semaphore
3181      * it's possible that someone could have issued multiple migrate_continue
3182      * and the semaphore is incorrectly positive at this point;
3183      * the docs say it's undefined to reinit a semaphore that's already
3184      * init'd, so use timedwait to eat up any existing posts.
3185      */
3186     while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
3187         /* This block intentionally left blank */
3188     }
3189 
3190     /*
3191      * If the migration is cancelled when it is in the completion phase,
3192      * the migration state is set to MIGRATION_STATUS_CANCELLING.
3193      * So we don't need to wait a semaphore, otherwise we would always
3194      * wait for the 'pause_sem' semaphore.
3195      */
3196     if (s->state != MIGRATION_STATUS_CANCELLING) {
3197         qemu_mutex_unlock_iothread();
3198         migrate_set_state(&s->state, *current_active_state,
3199                           MIGRATION_STATUS_PRE_SWITCHOVER);
3200         qemu_sem_wait(&s->pause_sem);
3201         migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
3202                           new_state);
3203         *current_active_state = new_state;
3204         qemu_mutex_lock_iothread();
3205     }
3206 
3207     return s->state == new_state ? 0 : -EINVAL;
3208 }
3209 
/**
 * migration_completion: Used by migration_thread when there's not much left.
 *   The caller 'breaks' the loop when this returns.
 *
 *   Finishes either a precopy migration (state ACTIVE) or a postcopy
 *   migration (state POSTCOPY_ACTIVE); any other state goes straight to
 *   the failure path.  On return the state has been moved on to COMPLETED,
 *   COLO or FAILED through migrate_set_state().
 *
 * @s: Current migration state
 */
static void migration_completion(MigrationState *s)
{
    int ret;
    /* State the final migrate_set_state() calls transition away from */
    int current_active_state = s->state;

    if (s->state == MIGRATION_STATUS_ACTIVE) {
        qemu_mutex_lock_iothread();
        s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
        s->vm_was_running = runstate_is_running();
        ret = global_state_store();

        if (!ret) {
            /* Block devices are only inactivated when COLO is not enabled */
            bool inactivate = !migrate_colo_enabled();
            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
            trace_migration_completion_vm_stop(ret);
            if (ret >= 0) {
                /* May update current_active_state to MIGRATION_STATUS_DEVICE */
                ret = migration_maybe_pause(s, &current_active_state,
                                            MIGRATION_STATUS_DEVICE);
            }
            if (ret >= 0) {
                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
                ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
                                                         inactivate);
            }
            if (inactivate && ret >= 0) {
                s->block_inactive = true;
            }
        }
        qemu_mutex_unlock_iothread();

        if (ret < 0) {
            goto fail;
        }
    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        trace_migration_completion_postcopy_end();

        qemu_mutex_lock_iothread();
        qemu_savevm_state_complete_postcopy(s->to_dst_file);
        qemu_mutex_unlock_iothread();

        trace_migration_completion_postcopy_end_after_complete();
    } else {
        /* Neither precopy nor postcopy active: nothing to complete */
        goto fail;
    }

    /*
     * If rp was opened we must clean up the thread before
     * cleaning everything else up (since if there are no failures
     * it will wait for the destination to send it's status in
     * a SHUT command).
     */
    if (s->rp_state.rp_thread_created) {
        int rp_error;
        trace_migration_return_path_end_before();
        rp_error = await_return_path_close_on_source(s);
        trace_migration_return_path_end_after(rp_error);
        if (rp_error) {
            goto fail_invalidate;
        }
    }

    if (qemu_file_get_error(s->to_dst_file)) {
        trace_migration_completion_file_err();
        goto fail_invalidate;
    }

    if (migrate_colo_enabled() && s->state == MIGRATION_STATUS_ACTIVE) {
        /* COLO does not support postcopy */
        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_COLO);
    } else {
        migrate_set_state(&s->state, current_active_state,
                          MIGRATION_STATUS_COMPLETED);
    }

    return;

fail_invalidate:
    /* If not doing postcopy, vm_start() will be called: let's regain
     * control on images.
     */
    if (s->state == MIGRATION_STATUS_ACTIVE ||
        s->state == MIGRATION_STATUS_DEVICE) {
        Error *local_err = NULL;

        qemu_mutex_lock_iothread();
        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        } else {
            /* Only cleared when re-activation reported no error */
            s->block_inactive = false;
        }
        qemu_mutex_unlock_iothread();
    }

fail:
    migrate_set_state(&s->state, current_active_state,
                      MIGRATION_STATUS_FAILED);
}
3316 
3317 /**
3318  * bg_migration_completion: Used by bg_migration_thread when after all the
3319  *   RAM has been saved. The caller 'breaks' the loop when this returns.
3320  *
3321  * @s: Current migration state
3322  */
3323 static void bg_migration_completion(MigrationState *s)
3324 {
3325     int current_active_state = s->state;
3326 
3327     /*
3328      * Stop tracking RAM writes - un-protect memory, un-register UFFD
3329      * memory ranges, flush kernel wait queues and wake up threads
3330      * waiting for write fault to be resolved.
3331      */
3332     ram_write_tracking_stop();
3333 
3334     if (s->state == MIGRATION_STATUS_ACTIVE) {
3335         /*
3336          * By this moment we have RAM content saved into the migration stream.
3337          * The next step is to flush the non-RAM content (device state)
3338          * right after the ram content. The device state has been stored into
3339          * the temporary buffer before RAM saving started.
3340          */
3341         qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
3342         qemu_fflush(s->to_dst_file);
3343     } else if (s->state == MIGRATION_STATUS_CANCELLING) {
3344         goto fail;
3345     }
3346 
3347     if (qemu_file_get_error(s->to_dst_file)) {
3348         trace_migration_completion_file_err();
3349         goto fail;
3350     }
3351 
3352     migrate_set_state(&s->state, current_active_state,
3353                       MIGRATION_STATUS_COMPLETED);
3354     return;
3355 
3356 fail:
3357     migrate_set_state(&s->state, current_active_state,
3358                       MIGRATION_STATUS_FAILED);
3359 }
3360 
3361 bool migrate_colo_enabled(void)
3362 {
3363     MigrationState *s = migrate_get_current();
3364     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
3365 }
3366 
/* Result of an error check made on behalf of the migration thread */
typedef enum MigThrError {
    MIG_THR_ERR_NONE = 0,       /* No error detected */
    MIG_THR_ERR_RECOVERED = 1,  /* Detected error, but resumed successfully */
    MIG_THR_ERR_FATAL = 2,      /* Detected fatal error, need to exit */
} MigThrError;
3375 
3376 static int postcopy_resume_handshake(MigrationState *s)
3377 {
3378     qemu_savevm_send_postcopy_resume(s->to_dst_file);
3379 
3380     while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3381         qemu_sem_wait(&s->rp_state.rp_sem);
3382     }
3383 
3384     if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3385         return 0;
3386     }
3387 
3388     return -1;
3389 }
3390 
3391 /* Return zero if success, or <0 for error */
3392 static int postcopy_do_resume(MigrationState *s)
3393 {
3394     int ret;
3395 
3396     /*
3397      * Call all the resume_prepare() hooks, so that modules can be
3398      * ready for the migration resume.
3399      */
3400     ret = qemu_savevm_state_resume_prepare(s);
3401     if (ret) {
3402         error_report("%s: resume_prepare() failure detected: %d",
3403                      __func__, ret);
3404         return ret;
3405     }
3406 
3407     /*
3408      * Last handshake with destination on the resume (destination will
3409      * switch to postcopy-active afterwards)
3410      */
3411     ret = postcopy_resume_handshake(s);
3412     if (ret) {
3413         error_report("%s: handshake failed: %d", __func__, ret);
3414         return ret;
3415     }
3416 
3417     return 0;
3418 }
3419 
3420 /*
3421  * We don't return until we are in a safe state to continue current
3422  * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
3423  * MIG_THR_ERR_FATAL if unrecovery failure happened.
3424  */
3425 static MigThrError postcopy_pause(MigrationState *s)
3426 {
3427     assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
3428 
3429     while (true) {
3430         QEMUFile *file;
3431 
3432         /*
3433          * Current channel is possibly broken. Release it.  Note that this is
3434          * guaranteed even without lock because to_dst_file should only be
3435          * modified by the migration thread.  That also guarantees that the
3436          * unregister of yank is safe too without the lock.  It should be safe
3437          * even to be within the qemu_file_lock, but we didn't do that to avoid
3438          * taking more mutex (yank_lock) within qemu_file_lock.  TL;DR: we make
3439          * the qemu_file_lock critical section as small as possible.
3440          */
3441         assert(s->to_dst_file);
3442         migration_ioc_unregister_yank_from_file(s->to_dst_file);
3443         qemu_mutex_lock(&s->qemu_file_lock);
3444         file = s->to_dst_file;
3445         s->to_dst_file = NULL;
3446         qemu_mutex_unlock(&s->qemu_file_lock);
3447 
3448         qemu_file_shutdown(file);
3449         qemu_fclose(file);
3450 
3451         migrate_set_state(&s->state, s->state,
3452                           MIGRATION_STATUS_POSTCOPY_PAUSED);
3453 
3454         error_report("Detected IO failure for postcopy. "
3455                      "Migration paused.");
3456 
3457         /*
3458          * We wait until things fixed up. Then someone will setup the
3459          * status back for us.
3460          */
3461         while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
3462             qemu_sem_wait(&s->postcopy_pause_sem);
3463         }
3464 
3465         if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3466             /* Woken up by a recover procedure. Give it a shot */
3467 
3468             /*
3469              * Firstly, let's wake up the return path now, with a new
3470              * return path channel.
3471              */
3472             qemu_sem_post(&s->postcopy_pause_rp_sem);
3473 
3474             /* Do the resume logic */
3475             if (postcopy_do_resume(s) == 0) {
3476                 /* Let's continue! */
3477                 trace_postcopy_pause_continued();
3478                 return MIG_THR_ERR_RECOVERED;
3479             } else {
3480                 /*
3481                  * Something wrong happened during the recovery, let's
3482                  * pause again. Pause is always better than throwing
3483                  * data away.
3484                  */
3485                 continue;
3486             }
3487         } else {
3488             /* This is not right... Time to quit. */
3489             return MIG_THR_ERR_FATAL;
3490         }
3491     }
3492 }
3493 
3494 static MigThrError migration_detect_error(MigrationState *s)
3495 {
3496     int ret;
3497     int state = s->state;
3498     Error *local_error = NULL;
3499 
3500     if (state == MIGRATION_STATUS_CANCELLING ||
3501         state == MIGRATION_STATUS_CANCELLED) {
3502         /* End the migration, but don't set the state to failed */
3503         return MIG_THR_ERR_FATAL;
3504     }
3505 
3506     /* Try to detect any file errors */
3507     ret = qemu_file_get_error_obj(s->to_dst_file, &local_error);
3508     if (!ret) {
3509         /* Everything is fine */
3510         assert(!local_error);
3511         return MIG_THR_ERR_NONE;
3512     }
3513 
3514     if (local_error) {
3515         migrate_set_error(s, local_error);
3516         error_free(local_error);
3517     }
3518 
3519     if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
3520         /*
3521          * For postcopy, we allow the network to be down for a
3522          * while. After that, it can be continued by a
3523          * recovery phase.
3524          */
3525         return postcopy_pause(s);
3526     } else {
3527         /*
3528          * For precopy (or postcopy with error outside IO), we fail
3529          * with no time.
3530          */
3531         migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
3532         trace_migration_thread_file_err();
3533 
3534         /* Time to stop the migration, now. */
3535         return MIG_THR_ERR_FATAL;
3536     }
3537 }
3538 
3539 /* How many bytes have we transferred since the beginning of the migration */
3540 static uint64_t migration_total_bytes(MigrationState *s)
3541 {
3542     return qemu_ftell(s->to_dst_file) + ram_counters.multifd_bytes;
3543 }
3544 
3545 static void migration_calculate_complete(MigrationState *s)
3546 {
3547     uint64_t bytes = migration_total_bytes(s);
3548     int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3549     int64_t transfer_time;
3550 
3551     s->total_time = end_time - s->start_time;
3552     if (!s->downtime) {
3553         /*
3554          * It's still not set, so we are precopy migration.  For
3555          * postcopy, downtime is calculated during postcopy_start().
3556          */
3557         s->downtime = end_time - s->downtime_start;
3558     }
3559 
3560     transfer_time = s->total_time - s->setup_time;
3561     if (transfer_time) {
3562         s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
3563     }
3564 }
3565 
3566 static void update_iteration_initial_status(MigrationState *s)
3567 {
3568     /*
3569      * Update these three fields at the same time to avoid mismatch info lead
3570      * wrong speed calculation.
3571      */
3572     s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3573     s->iteration_initial_bytes = migration_total_bytes(s);
3574     s->iteration_initial_pages = ram_get_total_transferred_pages();
3575 }
3576 
3577 static void migration_update_counters(MigrationState *s,
3578                                       int64_t current_time)
3579 {
3580     uint64_t transferred, transferred_pages, time_spent;
3581     uint64_t current_bytes; /* bytes transferred since the beginning */
3582     double bandwidth;
3583 
3584     if (current_time < s->iteration_start_time + BUFFER_DELAY) {
3585         return;
3586     }
3587 
3588     current_bytes = migration_total_bytes(s);
3589     transferred = current_bytes - s->iteration_initial_bytes;
3590     time_spent = current_time - s->iteration_start_time;
3591     bandwidth = (double)transferred / time_spent;
3592     s->threshold_size = bandwidth * s->parameters.downtime_limit;
3593 
3594     s->mbps = (((double) transferred * 8.0) /
3595                ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
3596 
3597     transferred_pages = ram_get_total_transferred_pages() -
3598                             s->iteration_initial_pages;
3599     s->pages_per_second = (double) transferred_pages /
3600                              (((double) time_spent / 1000.0));
3601 
3602     /*
3603      * if we haven't sent anything, we don't want to
3604      * recalculate. 10000 is a small enough number for our purposes
3605      */
3606     if (ram_counters.dirty_pages_rate && transferred > 10000) {
3607         s->expected_downtime = ram_counters.remaining / bandwidth;
3608     }
3609 
3610     qemu_file_reset_rate_limit(s->to_dst_file);
3611 
3612     update_iteration_initial_status(s);
3613 
3614     trace_migrate_transferred(transferred, time_spent,
3615                               bandwidth, s->threshold_size);
3616 }
3617 
/*
 * Verdict returned by the per-iteration helpers to the migration thread
 * loops, telling the loop what to do with the current pass.
 */
typedef enum {
    /* Carry on with the remainder of the current loop pass */
    MIG_ITERATE_RESUME,
    /* Drop the remainder of the current pass and start the next one */
    MIG_ITERATE_SKIP,
    /* Leave the loop */
    MIG_ITERATE_BREAK,
} MigIterateState;
3624 
3625 /*
3626  * Return true if continue to the next iteration directly, false
3627  * otherwise.
3628  */
3629 static MigIterateState migration_iteration_run(MigrationState *s)
3630 {
3631     uint64_t pending_size, pend_pre, pend_compat, pend_post;
3632     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
3633 
3634     qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
3635                               &pend_compat, &pend_post);
3636     pending_size = pend_pre + pend_compat + pend_post;
3637 
3638     trace_migrate_pending(pending_size, s->threshold_size,
3639                           pend_pre, pend_compat, pend_post);
3640 
3641     if (pending_size && pending_size >= s->threshold_size) {
3642         /* Still a significant amount to transfer */
3643         if (!in_postcopy && pend_pre <= s->threshold_size &&
3644             qatomic_read(&s->start_postcopy)) {
3645             if (postcopy_start(s)) {
3646                 error_report("%s: postcopy failed to start", __func__);
3647             }
3648             return MIG_ITERATE_SKIP;
3649         }
3650         /* Just another iteration step */
3651         qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
3652     } else {
3653         trace_migration_thread_low_pending(pending_size);
3654         migration_completion(s);
3655         return MIG_ITERATE_BREAK;
3656     }
3657 
3658     return MIG_ITERATE_RESUME;
3659 }
3660 
3661 static void migration_iteration_finish(MigrationState *s)
3662 {
3663     /* If we enabled cpu throttling for auto-converge, turn it off. */
3664     cpu_throttle_stop();
3665 
3666     qemu_mutex_lock_iothread();
3667     switch (s->state) {
3668     case MIGRATION_STATUS_COMPLETED:
3669         migration_calculate_complete(s);
3670         runstate_set(RUN_STATE_POSTMIGRATE);
3671         break;
3672     case MIGRATION_STATUS_COLO:
3673         if (!migrate_colo_enabled()) {
3674             error_report("%s: critical error: calling COLO code without "
3675                          "COLO enabled", __func__);
3676         }
3677         migrate_start_colo_process(s);
3678         s->vm_was_running = true;
3679         /* Fallthrough */
3680     case MIGRATION_STATUS_FAILED:
3681     case MIGRATION_STATUS_CANCELLED:
3682     case MIGRATION_STATUS_CANCELLING:
3683         if (s->vm_was_running) {
3684             if (!runstate_check(RUN_STATE_SHUTDOWN)) {
3685                 vm_start();
3686             }
3687         } else {
3688             if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
3689                 runstate_set(RUN_STATE_POSTMIGRATE);
3690             }
3691         }
3692         break;
3693 
3694     default:
3695         /* Should not reach here, but if so, forgive the VM. */
3696         error_report("%s: Unknown ending state %d", __func__, s->state);
3697         break;
3698     }
3699     migrate_fd_cleanup_schedule(s);
3700     qemu_mutex_unlock_iothread();
3701 }
3702 
3703 static void bg_migration_iteration_finish(MigrationState *s)
3704 {
3705     qemu_mutex_lock_iothread();
3706     switch (s->state) {
3707     case MIGRATION_STATUS_COMPLETED:
3708         migration_calculate_complete(s);
3709         break;
3710 
3711     case MIGRATION_STATUS_ACTIVE:
3712     case MIGRATION_STATUS_FAILED:
3713     case MIGRATION_STATUS_CANCELLED:
3714     case MIGRATION_STATUS_CANCELLING:
3715         break;
3716 
3717     default:
3718         /* Should not reach here, but if so, forgive the VM. */
3719         error_report("%s: Unknown ending state %d", __func__, s->state);
3720         break;
3721     }
3722 
3723     migrate_fd_cleanup_schedule(s);
3724     qemu_mutex_unlock_iothread();
3725 }
3726 
3727 /*
3728  * Return true if continue to the next iteration directly, false
3729  * otherwise.
3730  */
3731 static MigIterateState bg_migration_iteration_run(MigrationState *s)
3732 {
3733     int res;
3734 
3735     res = qemu_savevm_state_iterate(s->to_dst_file, false);
3736     if (res > 0) {
3737         bg_migration_completion(s);
3738         return MIG_ITERATE_BREAK;
3739     }
3740 
3741     return MIG_ITERATE_RESUME;
3742 }
3743 
3744 void migration_make_urgent_request(void)
3745 {
3746     qemu_sem_post(&migrate_get_current()->rate_limit_sem);
3747 }
3748 
3749 void migration_consume_urgent_request(void)
3750 {
3751     qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
3752 }
3753 
3754 /* Returns true if the rate limiting was broken by an urgent request */
3755 bool migration_rate_limit(void)
3756 {
3757     int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3758     MigrationState *s = migrate_get_current();
3759 
3760     bool urgent = false;
3761     migration_update_counters(s, now);
3762     if (qemu_file_rate_limit(s->to_dst_file)) {
3763 
3764         if (qemu_file_get_error(s->to_dst_file)) {
3765             return false;
3766         }
3767         /*
3768          * Wait for a delay to do rate limiting OR
3769          * something urgent to post the semaphore.
3770          */
3771         int ms = s->iteration_start_time + BUFFER_DELAY - now;
3772         trace_migration_rate_limit_pre(ms);
3773         if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
3774             /*
3775              * We were woken by one or more urgent things but
3776              * the timedwait will have consumed one of them.
3777              * The service routine for the urgent wake will dec
3778              * the semaphore itself for each item it consumes,
3779              * so add this one we just eat back.
3780              */
3781             qemu_sem_post(&s->rate_limit_sem);
3782             urgent = true;
3783         }
3784         trace_migration_rate_limit_post(urgent);
3785     }
3786     return urgent;
3787 }
3788 
3789 /*
3790  * if failover devices are present, wait they are completely
3791  * unplugged
3792  */
3793 
3794 static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
3795                                     int new_state)
3796 {
3797     if (qemu_savevm_state_guest_unplug_pending()) {
3798         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
3799 
3800         while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
3801                qemu_savevm_state_guest_unplug_pending()) {
3802             qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3803         }
3804         if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
3805             int timeout = 120; /* 30 seconds */
3806             /*
3807              * migration has been canceled
3808              * but as we have started an unplug we must wait the end
3809              * to be able to plug back the card
3810              */
3811             while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
3812                 qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3813             }
3814             if (qemu_savevm_state_guest_unplug_pending() &&
3815                 !qtest_enabled()) {
3816                 warn_report("migration: partially unplugged device on "
3817                             "failure");
3818             }
3819         }
3820 
3821         migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
3822     } else {
3823         migrate_set_state(&s->state, old_state, new_state);
3824     }
3825 }
3826 
3827 /*
3828  * Master migration thread on the source VM.
3829  * It drives the migration and pumps the data down the outgoing channel.
3830  */
3831 static void *migration_thread(void *opaque)
3832 {
3833     MigrationState *s = opaque;
3834     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3835     MigThrError thr_error;
3836     bool urgent = false;
3837 
3838     rcu_register_thread();
3839 
3840     object_ref(OBJECT(s));
3841     update_iteration_initial_status(s);
3842 
3843     qemu_savevm_state_header(s->to_dst_file);
3844 
3845     /*
3846      * If we opened the return path, we need to make sure dst has it
3847      * opened as well.
3848      */
3849     if (s->rp_state.rp_thread_created) {
3850         /* Now tell the dest that it should open its end so it can reply */
3851         qemu_savevm_send_open_return_path(s->to_dst_file);
3852 
3853         /* And do a ping that will make stuff easier to debug */
3854         qemu_savevm_send_ping(s->to_dst_file, 1);
3855     }
3856 
3857     if (migrate_postcopy()) {
3858         /*
3859          * Tell the destination that we *might* want to do postcopy later;
3860          * if the other end can't do postcopy it should fail now, nice and
3861          * early.
3862          */
3863         qemu_savevm_send_postcopy_advise(s->to_dst_file);
3864     }
3865 
3866     if (migrate_colo_enabled()) {
3867         /* Notify migration destination that we enable COLO */
3868         qemu_savevm_send_colo_enable(s->to_dst_file);
3869     }
3870 
3871     qemu_savevm_state_setup(s->to_dst_file);
3872 
3873     qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3874                                MIGRATION_STATUS_ACTIVE);
3875 
3876     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3877 
3878     trace_migration_thread_setup_complete();
3879 
3880     while (migration_is_active(s)) {
3881         if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
3882             MigIterateState iter_state = migration_iteration_run(s);
3883             if (iter_state == MIG_ITERATE_SKIP) {
3884                 continue;
3885             } else if (iter_state == MIG_ITERATE_BREAK) {
3886                 break;
3887             }
3888         }
3889 
3890         /*
3891          * Try to detect any kind of failures, and see whether we
3892          * should stop the migration now.
3893          */
3894         thr_error = migration_detect_error(s);
3895         if (thr_error == MIG_THR_ERR_FATAL) {
3896             /* Stop migration */
3897             break;
3898         } else if (thr_error == MIG_THR_ERR_RECOVERED) {
3899             /*
3900              * Just recovered from a e.g. network failure, reset all
3901              * the local variables. This is important to avoid
3902              * breaking transferred_bytes and bandwidth calculation
3903              */
3904             update_iteration_initial_status(s);
3905         }
3906 
3907         urgent = migration_rate_limit();
3908     }
3909 
3910     trace_migration_thread_after_loop();
3911     migration_iteration_finish(s);
3912     object_unref(OBJECT(s));
3913     rcu_unregister_thread();
3914     return NULL;
3915 }
3916 
3917 static void bg_migration_vm_start_bh(void *opaque)
3918 {
3919     MigrationState *s = opaque;
3920 
3921     qemu_bh_delete(s->vm_start_bh);
3922     s->vm_start_bh = NULL;
3923 
3924     vm_start();
3925     s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
3926 }
3927 
3928 /**
3929  * Background snapshot thread, based on live migration code.
3930  * This is an alternative implementation of live migration mechanism
3931  * introduced specifically to support background snapshots.
3932  *
3933  * It takes advantage of userfault_fd write protection mechanism introduced
3934  * in v5.7 kernel. Compared to existing dirty page logging migration much
3935  * lesser stream traffic is produced resulting in smaller snapshot images,
3936  * simply cause of no page duplicates can get into the stream.
3937  *
3938  * Another key point is that generated vmstate stream reflects machine state
3939  * 'frozen' at the beginning of snapshot creation compared to dirty page logging
3940  * mechanism, which effectively results in that saved snapshot is the state of VM
3941  * at the end of the process.
3942  */
3943 static void *bg_migration_thread(void *opaque)
3944 {
3945     MigrationState *s = opaque;
3946     int64_t setup_start;
3947     MigThrError thr_error;
3948     QEMUFile *fb;
3949     bool early_fail = true;
3950 
3951     rcu_register_thread();
3952     object_ref(OBJECT(s));
3953 
3954     qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
3955 
3956     setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3957     /*
3958      * We want to save vmstate for the moment when migration has been
3959      * initiated but also we want to save RAM content while VM is running.
3960      * The RAM content should appear first in the vmstate. So, we first
3961      * stash the non-RAM part of the vmstate to the temporary buffer,
3962      * then write RAM part of the vmstate to the migration stream
3963      * with vCPUs running and, finally, write stashed non-RAM part of
3964      * the vmstate from the buffer to the migration stream.
3965      */
3966     s->bioc = qio_channel_buffer_new(512 * 1024);
3967     qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
3968     fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc));
3969     object_unref(OBJECT(s->bioc));
3970 
3971     update_iteration_initial_status(s);
3972 
3973     /*
3974      * Prepare for tracking memory writes with UFFD-WP - populate
3975      * RAM pages before protecting.
3976      */
3977 #ifdef __linux__
3978     ram_write_tracking_prepare();
3979 #endif
3980 
3981     qemu_savevm_state_header(s->to_dst_file);
3982     qemu_savevm_state_setup(s->to_dst_file);
3983 
3984     qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3985                                MIGRATION_STATUS_ACTIVE);
3986 
3987     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3988 
3989     trace_migration_thread_setup_complete();
3990     s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3991 
3992     qemu_mutex_lock_iothread();
3993 
3994     /*
3995      * If VM is currently in suspended state, then, to make a valid runstate
3996      * transition in vm_stop_force_state() we need to wakeup it up.
3997      */
3998     qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
3999     s->vm_was_running = runstate_is_running();
4000 
4001     if (global_state_store()) {
4002         goto fail;
4003     }
4004     /* Forcibly stop VM before saving state of vCPUs and devices */
4005     if (vm_stop_force_state(RUN_STATE_PAUSED)) {
4006         goto fail;
4007     }
4008     /*
4009      * Put vCPUs in sync with shadow context structures, then
4010      * save their state to channel-buffer along with devices.
4011      */
4012     cpu_synchronize_all_states();
4013     if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
4014         goto fail;
4015     }
4016     /*
4017      * Since we are going to get non-iterable state data directly
4018      * from s->bioc->data, explicit flush is needed here.
4019      */
4020     qemu_fflush(fb);
4021 
4022     /* Now initialize UFFD context and start tracking RAM writes */
4023     if (ram_write_tracking_start()) {
4024         goto fail;
4025     }
4026     early_fail = false;
4027 
4028     /*
4029      * Start VM from BH handler to avoid write-fault lock here.
4030      * UFFD-WP protection for the whole RAM is already enabled so
4031      * calling VM state change notifiers from vm_start() would initiate
4032      * writes to virtio VQs memory which is in write-protected region.
4033      */
4034     s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
4035     qemu_bh_schedule(s->vm_start_bh);
4036 
4037     qemu_mutex_unlock_iothread();
4038 
4039     while (migration_is_active(s)) {
4040         MigIterateState iter_state = bg_migration_iteration_run(s);
4041         if (iter_state == MIG_ITERATE_SKIP) {
4042             continue;
4043         } else if (iter_state == MIG_ITERATE_BREAK) {
4044             break;
4045         }
4046 
4047         /*
4048          * Try to detect any kind of failures, and see whether we
4049          * should stop the migration now.
4050          */
4051         thr_error = migration_detect_error(s);
4052         if (thr_error == MIG_THR_ERR_FATAL) {
4053             /* Stop migration */
4054             break;
4055         }
4056 
4057         migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
4058     }
4059 
4060     trace_migration_thread_after_loop();
4061 
4062 fail:
4063     if (early_fail) {
4064         migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
4065                 MIGRATION_STATUS_FAILED);
4066         qemu_mutex_unlock_iothread();
4067     }
4068 
4069     bg_migration_iteration_finish(s);
4070 
4071     qemu_fclose(fb);
4072     object_unref(OBJECT(s));
4073     rcu_unregister_thread();
4074 
4075     return NULL;
4076 }
4077 
4078 void migrate_fd_connect(MigrationState *s, Error *error_in)
4079 {
4080     Error *local_err = NULL;
4081     int64_t rate_limit;
4082     bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
4083 
4084     /*
4085      * If there's a previous error, free it and prepare for another one.
4086      * Meanwhile if migration completes successfully, there won't have an error
4087      * dumped when calling migrate_fd_cleanup().
4088      */
4089     migrate_error_free(s);
4090 
4091     s->expected_downtime = s->parameters.downtime_limit;
4092     if (resume) {
4093         assert(s->cleanup_bh);
4094     } else {
4095         assert(!s->cleanup_bh);
4096         s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
4097     }
4098     if (error_in) {
4099         migrate_fd_error(s, error_in);
4100         if (resume) {
4101             /*
4102              * Don't do cleanup for resume if channel is invalid, but only dump
4103              * the error.  We wait for another channel connect from the user.
4104              * The error_report still gives HMP user a hint on what failed.
4105              * It's normally done in migrate_fd_cleanup(), but call it here
4106              * explicitly.
4107              */
4108             error_report_err(error_copy(s->error));
4109         } else {
4110             migrate_fd_cleanup(s);
4111         }
4112         return;
4113     }
4114 
4115     if (resume) {
4116         /* This is a resumed migration */
4117         rate_limit = s->parameters.max_postcopy_bandwidth /
4118             XFER_LIMIT_RATIO;
4119     } else {
4120         /* This is a fresh new migration */
4121         rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
4122 
4123         /* Notify before starting migration thread */
4124         notifier_list_notify(&migration_state_notifiers, s);
4125     }
4126 
4127     qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
4128     qemu_file_set_blocking(s->to_dst_file, true);
4129 
4130     /*
4131      * Open the return path. For postcopy, it is used exclusively. For
4132      * precopy, only if user specified "return-path" capability would
4133      * QEMU uses the return path.
4134      */
4135     if (migrate_postcopy_ram() || migrate_use_return_path()) {
4136         if (open_return_path_on_source(s, !resume)) {
4137             error_report("Unable to open return-path for postcopy");
4138             migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
4139             migrate_fd_cleanup(s);
4140             return;
4141         }
4142     }
4143 
4144     if (resume) {
4145         /* Wakeup the main migration thread to do the recovery */
4146         migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
4147                           MIGRATION_STATUS_POSTCOPY_RECOVER);
4148         qemu_sem_post(&s->postcopy_pause_sem);
4149         return;
4150     }
4151 
4152     if (multifd_save_setup(&local_err) != 0) {
4153         error_report_err(local_err);
4154         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
4155                           MIGRATION_STATUS_FAILED);
4156         migrate_fd_cleanup(s);
4157         return;
4158     }
4159 
4160     if (migrate_background_snapshot()) {
4161         qemu_thread_create(&s->thread, "bg_snapshot",
4162                 bg_migration_thread, s, QEMU_THREAD_JOINABLE);
4163     } else {
4164         qemu_thread_create(&s->thread, "live_migration",
4165                 migration_thread, s, QEMU_THREAD_JOINABLE);
4166     }
4167     s->migration_thread_running = true;
4168 }
4169 
4170 void migration_global_dump(Monitor *mon)
4171 {
4172     MigrationState *ms = migrate_get_current();
4173 
4174     monitor_printf(mon, "globals:\n");
4175     monitor_printf(mon, "store-global-state: %s\n",
4176                    ms->store_global_state ? "on" : "off");
4177     monitor_printf(mon, "only-migratable: %s\n",
4178                    only_migratable ? "on" : "off");
4179     monitor_printf(mon, "send-configuration: %s\n",
4180                    ms->send_configuration ? "on" : "off");
4181     monitor_printf(mon, "send-section-footer: %s\n",
4182                    ms->send_section_footer ? "on" : "off");
4183     monitor_printf(mon, "decompress-error-check: %s\n",
4184                    ms->decompress_error_check ? "on" : "off");
4185     monitor_printf(mon, "clear-bitmap-shift: %u\n",
4186                    ms->clear_bitmap_shift);
4187 }
4188 
4189 #define DEFINE_PROP_MIG_CAP(name, x)             \
4190     DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false)
4191 
4192 static Property migration_properties[] = {
4193     DEFINE_PROP_BOOL("store-global-state", MigrationState,
4194                      store_global_state, true),
4195     DEFINE_PROP_BOOL("send-configuration", MigrationState,
4196                      send_configuration, true),
4197     DEFINE_PROP_BOOL("send-section-footer", MigrationState,
4198                      send_section_footer, true),
4199     DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
4200                       decompress_error_check, true),
4201     DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
4202                       clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
4203 
4204     /* Migration parameters */
4205     DEFINE_PROP_UINT8("x-compress-level", MigrationState,
4206                       parameters.compress_level,
4207                       DEFAULT_MIGRATE_COMPRESS_LEVEL),
4208     DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
4209                       parameters.compress_threads,
4210                       DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
4211     DEFINE_PROP_BOOL("x-compress-wait-thread", MigrationState,
4212                       parameters.compress_wait_thread, true),
4213     DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
4214                       parameters.decompress_threads,
4215                       DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
4216     DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
4217                       parameters.throttle_trigger_threshold,
4218                       DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD),
4219     DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
4220                       parameters.cpu_throttle_initial,
4221                       DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
4222     DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
4223                       parameters.cpu_throttle_increment,
4224                       DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
4225     DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
4226                       parameters.cpu_throttle_tailslow, false),
4227     DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
4228                       parameters.max_bandwidth, MAX_THROTTLE),
4229     DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
4230                       parameters.downtime_limit,
4231                       DEFAULT_MIGRATE_SET_DOWNTIME),
4232     DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
4233                       parameters.x_checkpoint_delay,
4234                       DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
4235     DEFINE_PROP_UINT8("multifd-channels", MigrationState,
4236                       parameters.multifd_channels,
4237                       DEFAULT_MIGRATE_MULTIFD_CHANNELS),
4238     DEFINE_PROP_MULTIFD_COMPRESSION("multifd-compression", MigrationState,
4239                       parameters.multifd_compression,
4240                       DEFAULT_MIGRATE_MULTIFD_COMPRESSION),
4241     DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
4242                       parameters.multifd_zlib_level,
4243                       DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
4244     DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
4245                       parameters.multifd_zstd_level,
4246                       DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
4247     DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
4248                       parameters.xbzrle_cache_size,
4249                       DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
4250     DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState,
4251                       parameters.max_postcopy_bandwidth,
4252                       DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH),
4253     DEFINE_PROP_UINT8("max-cpu-throttle", MigrationState,
4254                       parameters.max_cpu_throttle,
4255                       DEFAULT_MIGRATE_MAX_CPU_THROTTLE),
4256     DEFINE_PROP_SIZE("announce-initial", MigrationState,
4257                       parameters.announce_initial,
4258                       DEFAULT_MIGRATE_ANNOUNCE_INITIAL),
4259     DEFINE_PROP_SIZE("announce-max", MigrationState,
4260                       parameters.announce_max,
4261                       DEFAULT_MIGRATE_ANNOUNCE_MAX),
4262     DEFINE_PROP_SIZE("announce-rounds", MigrationState,
4263                       parameters.announce_rounds,
4264                       DEFAULT_MIGRATE_ANNOUNCE_ROUNDS),
4265     DEFINE_PROP_SIZE("announce-step", MigrationState,
4266                       parameters.announce_step,
4267                       DEFAULT_MIGRATE_ANNOUNCE_STEP),
4268 
4269     /* Migration capabilities */
4270     DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
4271     DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
4272     DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
4273     DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS),
4274     DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
4275     DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
4276     DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
4277     DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
4278     DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
4279     DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
4280     DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
4281     DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
4282     DEFINE_PROP_MIG_CAP("x-background-snapshot",
4283             MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
4284 #ifdef CONFIG_LINUX
4285     DEFINE_PROP_MIG_CAP("x-zero-copy-send",
4286             MIGRATION_CAPABILITY_ZERO_COPY_SEND),
4287 #endif
4288 
4289     DEFINE_PROP_END_OF_LIST(),
4290 };
4291 
4292 static void migration_class_init(ObjectClass *klass, void *data)
4293 {
4294     DeviceClass *dc = DEVICE_CLASS(klass);
4295 
4296     dc->user_creatable = false;
4297     device_class_set_props(dc, migration_properties);
4298 }
4299 
4300 static void migration_instance_finalize(Object *obj)
4301 {
4302     MigrationState *ms = MIGRATION_OBJ(obj);
4303     MigrationParameters *params = &ms->parameters;
4304 
4305     qemu_mutex_destroy(&ms->error_mutex);
4306     qemu_mutex_destroy(&ms->qemu_file_lock);
4307     g_free(params->tls_hostname);
4308     g_free(params->tls_creds);
4309     qemu_sem_destroy(&ms->wait_unplug_sem);
4310     qemu_sem_destroy(&ms->rate_limit_sem);
4311     qemu_sem_destroy(&ms->pause_sem);
4312     qemu_sem_destroy(&ms->postcopy_pause_sem);
4313     qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
4314     qemu_sem_destroy(&ms->rp_state.rp_sem);
4315     error_free(ms->error);
4316 }
4317 
4318 static void migration_instance_init(Object *obj)
4319 {
4320     MigrationState *ms = MIGRATION_OBJ(obj);
4321     MigrationParameters *params = &ms->parameters;
4322 
4323     ms->state = MIGRATION_STATUS_NONE;
4324     ms->mbps = -1;
4325     ms->pages_per_second = -1;
4326     qemu_sem_init(&ms->pause_sem, 0);
4327     qemu_mutex_init(&ms->error_mutex);
4328 
4329     params->tls_hostname = g_strdup("");
4330     params->tls_creds = g_strdup("");
4331 
4332     /* Set has_* up only for parameter checks */
4333     params->has_compress_level = true;
4334     params->has_compress_threads = true;
4335     params->has_decompress_threads = true;
4336     params->has_throttle_trigger_threshold = true;
4337     params->has_cpu_throttle_initial = true;
4338     params->has_cpu_throttle_increment = true;
4339     params->has_cpu_throttle_tailslow = true;
4340     params->has_max_bandwidth = true;
4341     params->has_downtime_limit = true;
4342     params->has_x_checkpoint_delay = true;
4343     params->has_block_incremental = true;
4344     params->has_multifd_channels = true;
4345     params->has_multifd_compression = true;
4346     params->has_multifd_zlib_level = true;
4347     params->has_multifd_zstd_level = true;
4348     params->has_xbzrle_cache_size = true;
4349     params->has_max_postcopy_bandwidth = true;
4350     params->has_max_cpu_throttle = true;
4351     params->has_announce_initial = true;
4352     params->has_announce_max = true;
4353     params->has_announce_rounds = true;
4354     params->has_announce_step = true;
4355 
4356     qemu_sem_init(&ms->postcopy_pause_sem, 0);
4357     qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
4358     qemu_sem_init(&ms->rp_state.rp_sem, 0);
4359     qemu_sem_init(&ms->rate_limit_sem, 0);
4360     qemu_sem_init(&ms->wait_unplug_sem, 0);
4361     qemu_mutex_init(&ms->qemu_file_lock);
4362 }
4363 
4364 /*
4365  * Return true if check pass, false otherwise. Error will be put
4366  * inside errp if provided.
4367  */
4368 static bool migration_object_check(MigrationState *ms, Error **errp)
4369 {
4370     MigrationCapabilityStatusList *head = NULL;
4371     /* Assuming all off */
4372     bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
4373     int i;
4374 
4375     if (!migrate_params_check(&ms->parameters, errp)) {
4376         return false;
4377     }
4378 
4379     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
4380         if (ms->enabled_capabilities[i]) {
4381             QAPI_LIST_PREPEND(head, migrate_cap_add(i, true));
4382         }
4383     }
4384 
4385     ret = migrate_caps_check(cap_list, head, errp);
4386 
4387     /* It works with head == NULL */
4388     qapi_free_MigrationCapabilityStatusList(head);
4389 
4390     return ret;
4391 }
4392 
4393 static const TypeInfo migration_type = {
4394     .name = TYPE_MIGRATION,
4395     /*
4396      * NOTE: TYPE_MIGRATION is not really a device, as the object is
4397      * not created using qdev_new(), it is not attached to the qdev
4398      * device tree, and it is never realized.
4399      *
4400      * TODO: Make this TYPE_OBJECT once QOM provides something like
4401      * TYPE_DEVICE's "-global" properties.
4402      */
4403     .parent = TYPE_DEVICE,
4404     .class_init = migration_class_init,
4405     .class_size = sizeof(MigrationClass),
4406     .instance_size = sizeof(MigrationState),
4407     .instance_init = migration_instance_init,
4408     .instance_finalize = migration_instance_finalize,
4409 };
4410 
4411 static void register_migration_types(void)
4412 {
4413     type_register_static(&migration_type);
4414 }
4415 
4416 type_init(register_migration_types);
4417