xref: /openbmc/qemu/migration/migration.c (revision 1d76437b)
1 /*
2  * QEMU live migration
3  *
4  * Copyright IBM, Corp. 2008
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15 
16 #include "qemu/osdep.h"
17 #include "qemu/cutils.h"
18 #include "qemu/error-report.h"
19 #include "qemu/main-loop.h"
20 #include "migration/blocker.h"
21 #include "exec.h"
22 #include "fd.h"
23 #include "socket.h"
24 #include "sysemu/runstate.h"
25 #include "sysemu/sysemu.h"
26 #include "sysemu/cpu-throttle.h"
27 #include "rdma.h"
28 #include "ram.h"
29 #include "migration/global_state.h"
30 #include "migration/misc.h"
31 #include "migration.h"
32 #include "savevm.h"
33 #include "qemu-file-channel.h"
34 #include "qemu-file.h"
35 #include "migration/vmstate.h"
36 #include "block/block.h"
37 #include "qapi/error.h"
38 #include "qapi/clone-visitor.h"
39 #include "qapi/qapi-visit-migration.h"
40 #include "qapi/qapi-visit-sockets.h"
41 #include "qapi/qapi-commands-migration.h"
42 #include "qapi/qapi-events-migration.h"
43 #include "qapi/qmp/qerror.h"
44 #include "qapi/qmp/qnull.h"
45 #include "qemu/rcu.h"
46 #include "block.h"
47 #include "postcopy-ram.h"
48 #include "qemu/thread.h"
49 #include "trace.h"
50 #include "exec/target_page.h"
51 #include "io/channel-buffer.h"
52 #include "migration/colo.h"
53 #include "hw/boards.h"
54 #include "hw/qdev-properties.h"
55 #include "hw/qdev-properties-system.h"
56 #include "monitor/monitor.h"
57 #include "net/announce.h"
58 #include "qemu/queue.h"
59 #include "multifd.h"
60 #include "qemu/yank.h"
61 #include "sysemu/cpus.h"
62 #include "yank_functions.h"
63 
/* Migration transfer speed throttling: 128 << 20 == 128 MiB */
#define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */

/*
 * Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data.  XFER_LIMIT_RATIO is the number of such chunks in 1000 time
 * units (presumably milliseconds, i.e. chunks per second - confirm
 * against the users of these macros).
 */
#define BUFFER_DELAY     100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)

/*
 * Time in milliseconds we are allowed to stop the source,
 * for sending the last part
 */
#define DEFAULT_MIGRATE_SET_DOWNTIME 300

/* Maximum migrate downtime set to 2000 seconds (expressed below in ms) */
#define MAX_MIGRATE_DOWNTIME_SECONDS 2000
#define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000)

/* Default compression thread count */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/*
 * Default decompression thread count, usually decompression is at
 * least 4 times as fast as compression.
 */
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* 0: means nocompress, 1: best speed, ... 9: best compress ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
/* Define default autoconverge cpu throttle migration parameters */
#define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10
#define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99

/* Migration XBZRLE default cache size (64 MiB) */
#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)

/* The delay time (in ms) between two COLO checkpoints: 200 * 100 == 20000 */
#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
/* Defaults for the multifd channel count and compression method */
#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
#define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
/* 0: means nocompress, 1: best speed, ... 9: best compress ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
/* 0: means nocompress, 1: best speed, ... 20: best compress ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1

/*
 * Background transfer rate for postcopy, 0 means unlimited, note
 * that page requests can still exceed this limit.
 */
#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0

/*
 * Parameters for self_announce_delay giving a stream of RARP/ARP
 * packets after migration.
 */
#define DEFAULT_MIGRATE_ANNOUNCE_INITIAL  50
#define DEFAULT_MIGRATE_ANNOUNCE_MAX     550
#define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS    5
#define DEFAULT_MIGRATE_ANNOUNCE_STEP    100
117 
/*
 * File-local notifier list, statically initialized empty.
 * NOTE(review): judging by the name it carries migration state change
 * notifiers; its users are not visible in this part of the file.
 */
static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
120 
/*
 * Messages sent on the return path from destination to source.
 *
 * migrate_send_rp_message() writes the type as a big-endian 16-bit
 * value, followed by a big-endian 16-bit payload length and the payload
 * described next to each enumerator below.
 */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32 ) */

    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */

    MIG_RP_MSG_MAX           /* number of message types; not a message */
};
134 
/*
 * Migration capabilities set: a counted list of capabilities.
 * @caps is a flexible array member, so instances are created through
 * INITIALIZE_MIGRATE_CAPS_SET() which fills in @size to match.
 */
struct MigrateCapsSet {
    int size;                       /* Capability set size */
    MigrationCapability caps[];     /* Variadic array of capabilities */
};
typedef struct MigrateCapsSet MigrateCapsSet;
141 
/*
 * Define and initialize a MigrateCapsSet called @_name holding the
 * capabilities given as the variadic arguments.
 *
 * .size is derived at compile time by placing the arguments in an int
 * array compound literal and dividing its size by sizeof(int); .caps is
 * initialized from the same argument list.
 */
#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...)   \
    MigrateCapsSet _name = {    \
        .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \
        .caps = { __VA_ARGS__ } \
    }
148 
/*
 * Background-snapshot compatibility check list.
 * NOTE(review): presumably the capabilities that must not be enabled
 * together with background snapshot; the code consuming this set is not
 * visible in this part of the file - confirm there.
 */
static const
INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
    MIGRATION_CAPABILITY_POSTCOPY_RAM,
    MIGRATION_CAPABILITY_DIRTY_BITMAPS,
    MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME,
    MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE,
    MIGRATION_CAPABILITY_RETURN_PATH,
    MIGRATION_CAPABILITY_MULTIFD,
    MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER,
    MIGRATION_CAPABILITY_AUTO_CONVERGE,
    MIGRATION_CAPABILITY_RELEASE_RAM,
    MIGRATION_CAPABILITY_RDMA_PIN_ALL,
    MIGRATION_CAPABILITY_COMPRESS,
    MIGRATION_CAPABILITY_XBZRLE,
    MIGRATION_CAPABILITY_X_COLO,
    MIGRATION_CAPABILITY_VALIDATE_UUID);
166 
/*
 * When we add fault tolerance, we could have several migrations at
 * once.  For now we don't need dynamic creation of migration objects:
 * there is exactly one outgoing and one incoming state, both created
 * in migration_object_init().
 */

/* The single outgoing migration object; see migrate_get_current() */
static MigrationState *current_migration;
/* The single incoming state; see migration_incoming_get_current() */
static MigrationIncomingState *current_incoming;

/* NOTE(review): list of migration blockers; users are further down the file */
static GSList *migration_blockers;

/* Forward declarations for helpers defined later in this file */
static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state);
static void migrate_fd_cancel(MigrationState *s);
181 
182 static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
183 {
184     uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
185 
186     return (a > b) - (a < b);
187 }
188 
189 void migration_object_init(void)
190 {
191     Error *err = NULL;
192 
193     /* This can only be called once. */
194     assert(!current_migration);
195     current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
196 
197     /*
198      * Init the migrate incoming object as well no matter whether
199      * we'll use it or not.
200      */
201     assert(!current_incoming);
202     current_incoming = g_new0(MigrationIncomingState, 1);
203     current_incoming->state = MIGRATION_STATUS_NONE;
204     current_incoming->postcopy_remote_fds =
205         g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
206     qemu_mutex_init(&current_incoming->rp_mutex);
207     qemu_event_init(&current_incoming->main_thread_load_event, false);
208     qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
209     qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
210     qemu_mutex_init(&current_incoming->page_request_mutex);
211     current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
212 
213     if (!migration_object_check(current_migration, &err)) {
214         error_report_err(err);
215         exit(1);
216     }
217 
218     blk_mig_init();
219     ram_mig_init();
220     dirty_bitmap_mig_init();
221 }
222 
223 void migration_cancel(void)
224 {
225     migrate_fd_cancel(current_migration);
226 }
227 
228 void migration_shutdown(void)
229 {
230     /*
231      * Cancel the current migration - that will (eventually)
232      * stop the migration using this structure
233      */
234     migration_cancel();
235     object_unref(OBJECT(current_migration));
236 
237     /*
238      * Cancel outgoing migration of dirty bitmaps. It should
239      * at least unref used block nodes.
240      */
241     dirty_bitmap_mig_cancel_outgoing();
242 
243     /*
244      * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
245      * are non-critical data, and their loss never considered as
246      * something serious.
247      */
248     dirty_bitmap_mig_cancel_incoming();
249 }
250 
251 /* For outgoing */
252 MigrationState *migrate_get_current(void)
253 {
254     /* This can only be called after the object created. */
255     assert(current_migration);
256     return current_migration;
257 }
258 
259 MigrationIncomingState *migration_incoming_get_current(void)
260 {
261     assert(current_incoming);
262     return current_incoming;
263 }
264 
265 void migration_incoming_state_destroy(void)
266 {
267     struct MigrationIncomingState *mis = migration_incoming_get_current();
268 
269     if (mis->to_src_file) {
270         /* Tell source that we are done */
271         migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
272         qemu_fclose(mis->to_src_file);
273         mis->to_src_file = NULL;
274     }
275 
276     if (mis->from_src_file) {
277         migration_ioc_unregister_yank_from_file(mis->from_src_file);
278         qemu_fclose(mis->from_src_file);
279         mis->from_src_file = NULL;
280     }
281     if (mis->postcopy_remote_fds) {
282         g_array_free(mis->postcopy_remote_fds, TRUE);
283         mis->postcopy_remote_fds = NULL;
284     }
285     if (mis->transport_cleanup) {
286         mis->transport_cleanup(mis->transport_data);
287     }
288 
289     qemu_event_reset(&mis->main_thread_load_event);
290 
291     if (mis->page_requested) {
292         g_tree_destroy(mis->page_requested);
293         mis->page_requested = NULL;
294     }
295 
296     if (mis->socket_address_list) {
297         qapi_free_SocketAddressList(mis->socket_address_list);
298         mis->socket_address_list = NULL;
299     }
300 
301     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
302 }
303 
/* Emit a migration QAPI event for @new_state, if events are enabled. */
static void migrate_generate_event(int new_state)
{
    if (!migrate_use_events()) {
        return;
    }

    qapi_event_send_migration(new_state);
}
310 
311 static bool migrate_late_block_activate(void)
312 {
313     MigrationState *s;
314 
315     s = migrate_get_current();
316 
317     return s->enabled_capabilities[
318         MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE];
319 }
320 
321 /*
322  * Send a message on the return channel back to the source
323  * of the migration.
324  */
325 static int migrate_send_rp_message(MigrationIncomingState *mis,
326                                    enum mig_rp_message_type message_type,
327                                    uint16_t len, void *data)
328 {
329     int ret = 0;
330 
331     trace_migrate_send_rp_message((int)message_type, len);
332     QEMU_LOCK_GUARD(&mis->rp_mutex);
333 
334     /*
335      * It's possible that the file handle got lost due to network
336      * failures.
337      */
338     if (!mis->to_src_file) {
339         ret = -EIO;
340         return ret;
341     }
342 
343     qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
344     qemu_put_be16(mis->to_src_file, len);
345     qemu_put_buffer(mis->to_src_file, data, len);
346     qemu_fflush(mis->to_src_file);
347 
348     /* It's possible that qemu file got error during sending */
349     ret = qemu_file_get_error(mis->to_src_file);
350 
351     return ret;
352 }
353 
354 /* Request one page from the source VM at the given start address.
355  *   rb: the RAMBlock to request the page in
356  *   Start: Address offset within the RB
357  *   Len: Length in bytes required - must be a multiple of pagesize
358  */
359 int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
360                                       RAMBlock *rb, ram_addr_t start)
361 {
362     uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
363     size_t msglen = 12; /* start + len */
364     size_t len = qemu_ram_pagesize(rb);
365     enum mig_rp_message_type msg_type;
366     const char *rbname;
367     int rbname_len;
368 
369     *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
370     *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
371 
372     /*
373      * We maintain the last ramblock that we requested for page.  Note that we
374      * don't need locking because this function will only be called within the
375      * postcopy ram fault thread.
376      */
377     if (rb != mis->last_rb) {
378         mis->last_rb = rb;
379 
380         rbname = qemu_ram_get_idstr(rb);
381         rbname_len = strlen(rbname);
382 
383         assert(rbname_len < 256);
384 
385         bufc[msglen++] = rbname_len;
386         memcpy(bufc + msglen, rbname, rbname_len);
387         msglen += rbname_len;
388         msg_type = MIG_RP_MSG_REQ_PAGES_ID;
389     } else {
390         msg_type = MIG_RP_MSG_REQ_PAGES;
391     }
392 
393     return migrate_send_rp_message(mis, msg_type, msglen, bufc);
394 }
395 
396 int migrate_send_rp_req_pages(MigrationIncomingState *mis,
397                               RAMBlock *rb, ram_addr_t start, uint64_t haddr)
398 {
399     void *aligned = (void *)(uintptr_t)(haddr & (-qemu_ram_pagesize(rb)));
400     bool received = false;
401 
402     WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
403         received = ramblock_recv_bitmap_test_byte_offset(rb, start);
404         if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
405             /*
406              * The page has not been received, and it's not yet in the page
407              * request list.  Queue it.  Set the value of element to 1, so that
408              * things like g_tree_lookup() will return TRUE (1) when found.
409              */
410             g_tree_insert(mis->page_requested, aligned, (gpointer)1);
411             mis->page_requested_count++;
412             trace_postcopy_page_req_add(aligned, mis->page_requested_count);
413         }
414     }
415 
416     /*
417      * If the page is there, skip sending the message.  We don't even need the
418      * lock because as long as the page arrived, it'll be there forever.
419      */
420     if (received) {
421         return 0;
422     }
423 
424     return migrate_send_rp_message_req_pages(mis, rb, start);
425 }
426 
/* Set by migration_incoming_enable_colo(), cleared by ..._disable_colo() */
static bool migration_colo_enabled;

/* Report whether COLO is currently enabled on the incoming side. */
bool migration_incoming_colo_enabled(void)
{
    bool enabled = migration_colo_enabled;

    return enabled;
}
432 
433 void migration_incoming_disable_colo(void)
434 {
435     ram_block_discard_disable(false);
436     migration_colo_enabled = false;
437 }
438 
439 int migration_incoming_enable_colo(void)
440 {
441     if (ram_block_discard_disable(true)) {
442         error_report("COLO: cannot disable RAM discard");
443         return -EBUSY;
444     }
445     migration_colo_enabled = true;
446     return 0;
447 }
448 
449 void migrate_add_address(SocketAddress *address)
450 {
451     MigrationIncomingState *mis = migration_incoming_get_current();
452 
453     QAPI_LIST_PREPEND(mis->socket_address_list,
454                       QAPI_CLONE(SocketAddress, address));
455 }
456 
457 static void qemu_start_incoming_migration(const char *uri, Error **errp)
458 {
459     const char *p = NULL;
460 
461     qapi_event_send_migration(MIGRATION_STATUS_SETUP);
462     if (strstart(uri, "tcp:", &p) ||
463         strstart(uri, "unix:", NULL) ||
464         strstart(uri, "vsock:", NULL)) {
465         socket_start_incoming_migration(p ? p : uri, errp);
466 #ifdef CONFIG_RDMA
467     } else if (strstart(uri, "rdma:", &p)) {
468         rdma_start_incoming_migration(p, errp);
469 #endif
470     } else if (strstart(uri, "exec:", &p)) {
471         exec_start_incoming_migration(p, errp);
472     } else if (strstart(uri, "fd:", &p)) {
473         fd_start_incoming_migration(p, errp);
474     } else {
475         error_setg(errp, "unknown migration protocol: %s", uri);
476     }
477 }
478 
/*
 * Bottom half that finishes a successfully loaded incoming migration.
 *
 * @opaque is the MigrationIncomingState.  In order, it: activates the
 * block layer where appropriate, announces the guest on the network,
 * cleans up multifd, decides whether to start, pause or restore the run
 * state of the VM, marks the migration COMPLETED, deletes itself and
 * destroys the incoming state.
 */
static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    /* If capability late_block_activate is set:
     * Only fire up the block code now if we're going to restart the
     * VM, else 'cont' will do it.
     * This causes file locking to happen; so we don't want it to happen
     * unless we really are starting the VM.
     */
    if (!migrate_late_block_activate() ||
         (autostart && (!global_state_received() ||
            global_state_get_runstate() == RUN_STATE_RUNNING))) {
        /* Make sure all file formats flush their mutable metadata.
         * If we get an error here, just don't restart the VM yet. */
        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            /* reset so local_err can be reused by the multifd cleanup below */
            local_err = NULL;
            autostart = false;
        }
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self(&mis->announce_timer, migrate_announce_params());

    /* A multifd cleanup failure is reported and also prevents autostart */
    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
        autostart = false;
    }
    /* If global state section was not received or we are in running
       state, we need to obey autostart. Any other state is set with
       runstate_set. */

    dirty_bitmap_mig_before_vm_start();

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else if (migration_incoming_colo_enabled()) {
        /* COLO was enabled for the load: turn it off and run the VM */
        migration_incoming_disable_colo();
        vm_start();
    } else {
        runstate_set(global_state_get_runstate());
    }
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
}
542 
/*
 * Coroutine body that loads the incoming migration stream.
 *
 * @opaque is unused; the state comes from
 * migration_incoming_get_current(), whose from_src_file must already be
 * set.  The state moves NONE -> ACTIVE, the VM state is loaded, and then:
 *  - if postcopy was started and the load did not fail, return and leave
 *    cleanup to the postcopy thread;
 *  - if COLO is enabled and the load succeeded, run the COLO incoming
 *    thread and wait for it;
 *  - on success, schedule process_incoming_migration_bh();
 *  - on failure, mark the migration FAILED and exit the process.
 */
static void process_incoming_migration_co(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyState ps;
    int ret;
    Error *local_err = NULL;

    assert(mis->from_src_file);
    mis->migration_incoming_co = qemu_coroutine_self();
    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_ACTIVE);
    ret = qemu_loadvm_state(mis->from_src_file);

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else if something went wrong then just fall out of the normal exit */
    }

    /* we get COLO info, and know if we are in COLO mode */
    if (!ret && migration_incoming_colo_enabled()) {
        /* Make sure all file formats flush their mutable metadata */
        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            goto fail;
        }

        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
        mis->have_colo_incoming_thread = true;
        /* NOTE(review): presumably re-entered once the COLO thread is done */
        qemu_coroutine_yield();

        /* Wait checkpoint incoming thread exit before free resource */
        qemu_thread_join(&mis->colo_incoming_thread);
        /* We hold the global iothread lock, so it is safe here */
        colo_release_ram_cache();
    }

    if (ret < 0) {
        error_report("load of migration failed: %s", strerror(-ret));
        goto fail;
    }
    /* Success: the rest of the completion work runs from a bottom half */
    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    qemu_bh_schedule(mis->bh);
    mis->migration_incoming_co = NULL;
    return;
fail:
    /* local_err may have been consumed by error_report_err() above */
    local_err = NULL;
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    qemu_fclose(mis->from_src_file);
    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
    }
    exit(EXIT_FAILURE);
}
617 
618 /**
619  * @migration_incoming_setup: Setup incoming migration
620  *
621  * Returns 0 for no error or 1 for error
622  *
623  * @f: file for main migration channel
624  * @errp: where to put errors
625  */
626 static int migration_incoming_setup(QEMUFile *f, Error **errp)
627 {
628     MigrationIncomingState *mis = migration_incoming_get_current();
629     Error *local_err = NULL;
630 
631     if (multifd_load_setup(&local_err) != 0) {
632         /* We haven't been able to create multifd threads
633            nothing better to do */
634         error_report_err(local_err);
635         exit(EXIT_FAILURE);
636     }
637 
638     if (!mis->from_src_file) {
639         mis->from_src_file = f;
640     }
641     qemu_file_set_blocking(f, false);
642     return 0;
643 }
644 
645 void migration_incoming_process(void)
646 {
647     Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
648     qemu_coroutine_enter(co);
649 }
650 
651 /* Returns true if recovered from a paused migration, otherwise false */
652 static bool postcopy_try_recover(QEMUFile *f)
653 {
654     MigrationIncomingState *mis = migration_incoming_get_current();
655 
656     if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
657         /* Resumed from a paused postcopy migration */
658 
659         mis->from_src_file = f;
660         /* Postcopy has standalone thread to do vm load */
661         qemu_file_set_blocking(f, true);
662 
663         /* Re-configure the return path */
664         mis->to_src_file = qemu_file_get_return_path(f);
665 
666         migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
667                           MIGRATION_STATUS_POSTCOPY_RECOVER);
668 
669         /*
670          * Here, we only wake up the main loading thread (while the
671          * fault thread will still be waiting), so that we can receive
672          * commands from source now, and answer it if needed. The
673          * fault thread will be woken up afterwards until we are sure
674          * that source is ready to reply to page requests.
675          */
676         qemu_sem_post(&mis->postcopy_pause_sem_dst);
677         return true;
678     }
679 
680     return false;
681 }
682 
683 void migration_fd_process_incoming(QEMUFile *f, Error **errp)
684 {
685     Error *local_err = NULL;
686 
687     if (postcopy_try_recover(f)) {
688         return;
689     }
690 
691     if (migration_incoming_setup(f, &local_err)) {
692         error_propagate(errp, local_err);
693         return;
694     }
695     migration_incoming_process();
696 }
697 
698 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
699 {
700     MigrationIncomingState *mis = migration_incoming_get_current();
701     Error *local_err = NULL;
702     bool start_migration;
703 
704     if (!mis->from_src_file) {
705         /* The first connection (multifd may have multiple) */
706         QEMUFile *f = qemu_fopen_channel_input(ioc);
707 
708         /* If it's a recovery, we're done */
709         if (postcopy_try_recover(f)) {
710             return;
711         }
712 
713         if (migration_incoming_setup(f, &local_err)) {
714             error_propagate(errp, local_err);
715             return;
716         }
717 
718         /*
719          * Common migration only needs one channel, so we can start
720          * right now.  Multifd needs more than one channel, we wait.
721          */
722         start_migration = !migrate_use_multifd();
723     } else {
724         /* Multiple connections */
725         assert(migrate_use_multifd());
726         start_migration = multifd_recv_new_channel(ioc, &local_err);
727         if (local_err) {
728             error_propagate(errp, local_err);
729             return;
730         }
731     }
732 
733     if (start_migration) {
734         migration_incoming_process();
735     }
736 }
737 
738 /**
739  * @migration_has_all_channels: We have received all channels that we need
740  *
741  * Returns true when we have got connections to all the channels that
742  * we need for migration.
743  */
744 bool migration_has_all_channels(void)
745 {
746     MigrationIncomingState *mis = migration_incoming_get_current();
747     bool all_channels;
748 
749     all_channels = multifd_recv_all_channels_created();
750 
751     return all_channels && mis->from_src_file != NULL;
752 }
753 
754 /*
755  * Send a 'SHUT' message on the return channel with the given value
756  * to indicate that we've finished with the RP.  Non-0 value indicates
757  * error.
758  */
759 void migrate_send_rp_shut(MigrationIncomingState *mis,
760                           uint32_t value)
761 {
762     uint32_t buf;
763 
764     buf = cpu_to_be32(value);
765     migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
766 }
767 
768 /*
769  * Send a 'PONG' message on the return channel with the given value
770  * (normally in response to a 'PING')
771  */
772 void migrate_send_rp_pong(MigrationIncomingState *mis,
773                           uint32_t value)
774 {
775     uint32_t buf;
776 
777     buf = cpu_to_be32(value);
778     migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
779 }
780 
781 void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
782                                  char *block_name)
783 {
784     char buf[512];
785     int len;
786     int64_t res;
787 
788     /*
789      * First, we send the header part. It contains only the len of
790      * idstr, and the idstr itself.
791      */
792     len = strlen(block_name);
793     buf[0] = len;
794     memcpy(buf + 1, block_name, len);
795 
796     if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
797         error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
798                      __func__);
799         return;
800     }
801 
802     migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);
803 
804     /*
805      * Next, we dump the received bitmap to the stream.
806      *
807      * TODO: currently we are safe since we are the only one that is
808      * using the to_src_file handle (fault thread is still paused),
809      * and it's ok even not taking the mutex. However the best way is
810      * to take the lock before sending the message header, and release
811      * the lock after sending the bitmap.
812      */
813     qemu_mutex_lock(&mis->rp_mutex);
814     res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
815     qemu_mutex_unlock(&mis->rp_mutex);
816 
817     trace_migrate_send_rp_recv_bitmap(block_name, res);
818 }
819 
820 void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
821 {
822     uint32_t buf;
823 
824     buf = cpu_to_be32(value);
825     migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
826 }
827 
828 MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
829 {
830     MigrationCapabilityStatusList *head = NULL, **tail = &head;
831     MigrationCapabilityStatus *caps;
832     MigrationState *s = migrate_get_current();
833     int i;
834 
835     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
836 #ifndef CONFIG_LIVE_BLOCK_MIGRATION
837         if (i == MIGRATION_CAPABILITY_BLOCK) {
838             continue;
839         }
840 #endif
841         caps = g_malloc0(sizeof(*caps));
842         caps->capability = i;
843         caps->state = s->enabled_capabilities[i];
844         QAPI_LIST_APPEND(tail, caps);
845     }
846 
847     return head;
848 }
849 
850 MigrationParameters *qmp_query_migrate_parameters(Error **errp)
851 {
852     MigrationParameters *params;
853     MigrationState *s = migrate_get_current();
854 
855     /* TODO use QAPI_CLONE() instead of duplicating it inline */
856     params = g_malloc0(sizeof(*params));
857     params->has_compress_level = true;
858     params->compress_level = s->parameters.compress_level;
859     params->has_compress_threads = true;
860     params->compress_threads = s->parameters.compress_threads;
861     params->has_compress_wait_thread = true;
862     params->compress_wait_thread = s->parameters.compress_wait_thread;
863     params->has_decompress_threads = true;
864     params->decompress_threads = s->parameters.decompress_threads;
865     params->has_throttle_trigger_threshold = true;
866     params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold;
867     params->has_cpu_throttle_initial = true;
868     params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
869     params->has_cpu_throttle_increment = true;
870     params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
871     params->has_cpu_throttle_tailslow = true;
872     params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
873     params->has_tls_creds = true;
874     params->tls_creds = g_strdup(s->parameters.tls_creds);
875     params->has_tls_hostname = true;
876     params->tls_hostname = g_strdup(s->parameters.tls_hostname);
877     params->has_tls_authz = true;
878     params->tls_authz = g_strdup(s->parameters.tls_authz ?
879                                  s->parameters.tls_authz : "");
880     params->has_max_bandwidth = true;
881     params->max_bandwidth = s->parameters.max_bandwidth;
882     params->has_downtime_limit = true;
883     params->downtime_limit = s->parameters.downtime_limit;
884     params->has_x_checkpoint_delay = true;
885     params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
886     params->has_block_incremental = true;
887     params->block_incremental = s->parameters.block_incremental;
888     params->has_multifd_channels = true;
889     params->multifd_channels = s->parameters.multifd_channels;
890     params->has_multifd_compression = true;
891     params->multifd_compression = s->parameters.multifd_compression;
892     params->has_multifd_zlib_level = true;
893     params->multifd_zlib_level = s->parameters.multifd_zlib_level;
894     params->has_multifd_zstd_level = true;
895     params->multifd_zstd_level = s->parameters.multifd_zstd_level;
896     params->has_xbzrle_cache_size = true;
897     params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
898     params->has_max_postcopy_bandwidth = true;
899     params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth;
900     params->has_max_cpu_throttle = true;
901     params->max_cpu_throttle = s->parameters.max_cpu_throttle;
902     params->has_announce_initial = true;
903     params->announce_initial = s->parameters.announce_initial;
904     params->has_announce_max = true;
905     params->announce_max = s->parameters.announce_max;
906     params->has_announce_rounds = true;
907     params->announce_rounds = s->parameters.announce_rounds;
908     params->has_announce_step = true;
909     params->announce_step = s->parameters.announce_step;
910 
911     if (s->parameters.has_block_bitmap_mapping) {
912         params->has_block_bitmap_mapping = true;
913         params->block_bitmap_mapping =
914             QAPI_CLONE(BitmapMigrationNodeAliasList,
915                        s->parameters.block_bitmap_mapping);
916     }
917 
918     return params;
919 }
920 
921 AnnounceParameters *migrate_announce_params(void)
922 {
923     static AnnounceParameters ap;
924 
925     MigrationState *s = migrate_get_current();
926 
927     ap.initial = s->parameters.announce_initial;
928     ap.max = s->parameters.announce_max;
929     ap.rounds = s->parameters.announce_rounds;
930     ap.step = s->parameters.announce_step;
931 
932     return &ap;
933 }
934 
935 /*
936  * Return true if we're already in the middle of a migration
937  * (i.e. any of the active or setup states)
938  */
939 bool migration_is_setup_or_active(int state)
940 {
941     switch (state) {
942     case MIGRATION_STATUS_ACTIVE:
943     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
944     case MIGRATION_STATUS_POSTCOPY_PAUSED:
945     case MIGRATION_STATUS_POSTCOPY_RECOVER:
946     case MIGRATION_STATUS_SETUP:
947     case MIGRATION_STATUS_PRE_SWITCHOVER:
948     case MIGRATION_STATUS_DEVICE:
949     case MIGRATION_STATUS_WAIT_UNPLUG:
950     case MIGRATION_STATUS_COLO:
951         return true;
952 
953     default:
954         return false;
955 
956     }
957 }
958 
959 bool migration_is_running(int state)
960 {
961     switch (state) {
962     case MIGRATION_STATUS_ACTIVE:
963     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
964     case MIGRATION_STATUS_POSTCOPY_PAUSED:
965     case MIGRATION_STATUS_POSTCOPY_RECOVER:
966     case MIGRATION_STATUS_SETUP:
967     case MIGRATION_STATUS_PRE_SWITCHOVER:
968     case MIGRATION_STATUS_DEVICE:
969     case MIGRATION_STATUS_WAIT_UNPLUG:
970     case MIGRATION_STATUS_CANCELLING:
971         return true;
972 
973     default:
974         return false;
975 
976     }
977 }
978 
979 static void populate_time_info(MigrationInfo *info, MigrationState *s)
980 {
981     info->has_status = true;
982     info->has_setup_time = true;
983     info->setup_time = s->setup_time;
984     if (s->state == MIGRATION_STATUS_COMPLETED) {
985         info->has_total_time = true;
986         info->total_time = s->total_time;
987         info->has_downtime = true;
988         info->downtime = s->downtime;
989     } else {
990         info->has_total_time = true;
991         info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
992                            s->start_time;
993         info->has_expected_downtime = true;
994         info->expected_downtime = s->expected_downtime;
995     }
996 }
997 
998 static void populate_ram_info(MigrationInfo *info, MigrationState *s)
999 {
1000     info->has_ram = true;
1001     info->ram = g_malloc0(sizeof(*info->ram));
1002     info->ram->transferred = ram_counters.transferred;
1003     info->ram->total = ram_bytes_total();
1004     info->ram->duplicate = ram_counters.duplicate;
1005     /* legacy value.  It is not used anymore */
1006     info->ram->skipped = 0;
1007     info->ram->normal = ram_counters.normal;
1008     info->ram->normal_bytes = ram_counters.normal *
1009         qemu_target_page_size();
1010     info->ram->mbps = s->mbps;
1011     info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
1012     info->ram->postcopy_requests = ram_counters.postcopy_requests;
1013     info->ram->page_size = qemu_target_page_size();
1014     info->ram->multifd_bytes = ram_counters.multifd_bytes;
1015     info->ram->pages_per_second = s->pages_per_second;
1016 
1017     if (migrate_use_xbzrle()) {
1018         info->has_xbzrle_cache = true;
1019         info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
1020         info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
1021         info->xbzrle_cache->bytes = xbzrle_counters.bytes;
1022         info->xbzrle_cache->pages = xbzrle_counters.pages;
1023         info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
1024         info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
1025         info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
1026         info->xbzrle_cache->overflow = xbzrle_counters.overflow;
1027     }
1028 
1029     if (migrate_use_compression()) {
1030         info->has_compression = true;
1031         info->compression = g_malloc0(sizeof(*info->compression));
1032         info->compression->pages = compression_counters.pages;
1033         info->compression->busy = compression_counters.busy;
1034         info->compression->busy_rate = compression_counters.busy_rate;
1035         info->compression->compressed_size =
1036                                     compression_counters.compressed_size;
1037         info->compression->compression_rate =
1038                                     compression_counters.compression_rate;
1039     }
1040 
1041     if (cpu_throttle_active()) {
1042         info->has_cpu_throttle_percentage = true;
1043         info->cpu_throttle_percentage = cpu_throttle_get_percentage();
1044     }
1045 
1046     if (s->state != MIGRATION_STATUS_COMPLETED) {
1047         info->ram->remaining = ram_bytes_remaining();
1048         info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate;
1049     }
1050 }
1051 
1052 static void populate_disk_info(MigrationInfo *info)
1053 {
1054     if (blk_mig_active()) {
1055         info->has_disk = true;
1056         info->disk = g_malloc0(sizeof(*info->disk));
1057         info->disk->transferred = blk_mig_bytes_transferred();
1058         info->disk->remaining = blk_mig_bytes_remaining();
1059         info->disk->total = blk_mig_bytes_total();
1060     }
1061 }
1062 
1063 static void fill_source_migration_info(MigrationInfo *info)
1064 {
1065     MigrationState *s = migrate_get_current();
1066     GSList *cur_blocker = migration_blockers;
1067 
1068     info->blocked_reasons = NULL;
1069 
1070     /*
1071      * There are two types of reasons a migration might be blocked;
1072      * a) devices marked in VMState as non-migratable, and
1073      * b) Explicit migration blockers
1074      * We need to add both of them here.
1075      */
1076     qemu_savevm_non_migratable_list(&info->blocked_reasons);
1077 
1078     while (cur_blocker) {
1079         QAPI_LIST_PREPEND(info->blocked_reasons,
1080                           g_strdup(error_get_pretty(cur_blocker->data)));
1081         cur_blocker = g_slist_next(cur_blocker);
1082     }
1083     info->has_blocked_reasons = info->blocked_reasons != NULL;
1084 
1085     switch (s->state) {
1086     case MIGRATION_STATUS_NONE:
1087         /* no migration has happened ever */
1088         /* do not overwrite destination migration status */
1089         return;
1090     case MIGRATION_STATUS_SETUP:
1091         info->has_status = true;
1092         info->has_total_time = false;
1093         break;
1094     case MIGRATION_STATUS_ACTIVE:
1095     case MIGRATION_STATUS_CANCELLING:
1096     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1097     case MIGRATION_STATUS_PRE_SWITCHOVER:
1098     case MIGRATION_STATUS_DEVICE:
1099     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1100     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1101         /* TODO add some postcopy stats */
1102         populate_time_info(info, s);
1103         populate_ram_info(info, s);
1104         populate_disk_info(info);
1105         populate_vfio_info(info);
1106         break;
1107     case MIGRATION_STATUS_COLO:
1108         info->has_status = true;
1109         /* TODO: display COLO specific information (checkpoint info etc.) */
1110         break;
1111     case MIGRATION_STATUS_COMPLETED:
1112         populate_time_info(info, s);
1113         populate_ram_info(info, s);
1114         populate_vfio_info(info);
1115         break;
1116     case MIGRATION_STATUS_FAILED:
1117         info->has_status = true;
1118         if (s->error) {
1119             info->has_error_desc = true;
1120             info->error_desc = g_strdup(error_get_pretty(s->error));
1121         }
1122         break;
1123     case MIGRATION_STATUS_CANCELLED:
1124         info->has_status = true;
1125         break;
1126     case MIGRATION_STATUS_WAIT_UNPLUG:
1127         info->has_status = true;
1128         break;
1129     }
1130     info->status = s->state;
1131 }
1132 
/*
 * Level of write-tracking support, in increasing order.  The numeric
 * ordering matters: callers compare values with '<'.
 */
typedef enum WriteTrackingSupport {
    WT_SUPPORT_UNKNOWN    = 0,
    WT_SUPPORT_ABSENT     = 1,  /* ram_write_tracking_available() failed */
    WT_SUPPORT_AVAILABLE  = 2,  /* available, but not compatible */
    WT_SUPPORT_COMPATIBLE = 3   /* available and compatible */
} WriteTrackingSupport;
1139 
1140 static
1141 WriteTrackingSupport migrate_query_write_tracking(void)
1142 {
1143     /* Check if kernel supports required UFFD features */
1144     if (!ram_write_tracking_available()) {
1145         return WT_SUPPORT_ABSENT;
1146     }
1147     /*
1148      * Check if current memory configuration is
1149      * compatible with required UFFD features.
1150      */
1151     if (!ram_write_tracking_compatible()) {
1152         return WT_SUPPORT_AVAILABLE;
1153     }
1154 
1155     return WT_SUPPORT_COMPATIBLE;
1156 }
1157 
1158 /**
1159  * @migration_caps_check - check capability validity
1160  *
1161  * @cap_list: old capability list, array of bool
1162  * @params: new capabilities to be applied soon
1163  * @errp: set *errp if the check failed, with reason
1164  *
1165  * Returns true if check passed, otherwise false.
1166  */
1167 static bool migrate_caps_check(bool *cap_list,
1168                                MigrationCapabilityStatusList *params,
1169                                Error **errp)
1170 {
1171     MigrationCapabilityStatusList *cap;
1172     bool old_postcopy_cap;
1173     MigrationIncomingState *mis = migration_incoming_get_current();
1174 
1175     old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM];
1176 
1177     for (cap = params; cap; cap = cap->next) {
1178         cap_list[cap->value->capability] = cap->value->state;
1179     }
1180 
1181 #ifndef CONFIG_LIVE_BLOCK_MIGRATION
1182     if (cap_list[MIGRATION_CAPABILITY_BLOCK]) {
1183         error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) "
1184                    "block migration");
1185         error_append_hint(errp, "Use drive_mirror+NBD instead.\n");
1186         return false;
1187     }
1188 #endif
1189 
1190 #ifndef CONFIG_REPLICATION
1191     if (cap_list[MIGRATION_CAPABILITY_X_COLO]) {
1192         error_setg(errp, "QEMU compiled without replication module"
1193                    " can't enable COLO");
1194         error_append_hint(errp, "Please enable replication before COLO.\n");
1195         return false;
1196     }
1197 #endif
1198 
1199     if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
1200         /* This check is reasonably expensive, so only when it's being
1201          * set the first time, also it's only the destination that needs
1202          * special support.
1203          */
1204         if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
1205             !postcopy_ram_supported_by_host(mis)) {
1206             /* postcopy_ram_supported_by_host will have emitted a more
1207              * detailed message
1208              */
1209             error_setg(errp, "Postcopy is not supported");
1210             return false;
1211         }
1212 
1213         if (cap_list[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) {
1214             error_setg(errp, "Postcopy is not compatible with ignore-shared");
1215             return false;
1216         }
1217     }
1218 
1219     if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
1220         WriteTrackingSupport wt_support;
1221         int idx;
1222         /*
1223          * Check if 'background-snapshot' capability is supported by
1224          * host kernel and compatible with guest memory configuration.
1225          */
1226         wt_support = migrate_query_write_tracking();
1227         if (wt_support < WT_SUPPORT_AVAILABLE) {
1228             error_setg(errp, "Background-snapshot is not supported by host kernel");
1229             return false;
1230         }
1231         if (wt_support < WT_SUPPORT_COMPATIBLE) {
1232             error_setg(errp, "Background-snapshot is not compatible "
1233                     "with guest memory configuration");
1234             return false;
1235         }
1236 
1237         /*
1238          * Check if there are any migration capabilities
1239          * incompatible with 'background-snapshot'.
1240          */
1241         for (idx = 0; idx < check_caps_background_snapshot.size; idx++) {
1242             int incomp_cap = check_caps_background_snapshot.caps[idx];
1243             if (cap_list[incomp_cap]) {
1244                 error_setg(errp,
1245                         "Background-snapshot is not compatible with %s",
1246                         MigrationCapability_str(incomp_cap));
1247                 return false;
1248             }
1249         }
1250     }
1251 
1252     return true;
1253 }
1254 
1255 static void fill_destination_migration_info(MigrationInfo *info)
1256 {
1257     MigrationIncomingState *mis = migration_incoming_get_current();
1258 
1259     if (mis->socket_address_list) {
1260         info->has_socket_address = true;
1261         info->socket_address =
1262             QAPI_CLONE(SocketAddressList, mis->socket_address_list);
1263     }
1264 
1265     switch (mis->state) {
1266     case MIGRATION_STATUS_NONE:
1267         return;
1268     case MIGRATION_STATUS_SETUP:
1269     case MIGRATION_STATUS_CANCELLING:
1270     case MIGRATION_STATUS_CANCELLED:
1271     case MIGRATION_STATUS_ACTIVE:
1272     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1273     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1274     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1275     case MIGRATION_STATUS_FAILED:
1276     case MIGRATION_STATUS_COLO:
1277         info->has_status = true;
1278         break;
1279     case MIGRATION_STATUS_COMPLETED:
1280         info->has_status = true;
1281         fill_destination_postcopy_migration_info(info);
1282         break;
1283     }
1284     info->status = mis->state;
1285 }
1286 
1287 MigrationInfo *qmp_query_migrate(Error **errp)
1288 {
1289     MigrationInfo *info = g_malloc0(sizeof(*info));
1290 
1291     fill_destination_migration_info(info);
1292     fill_source_migration_info(info);
1293 
1294     return info;
1295 }
1296 
1297 void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
1298                                   Error **errp)
1299 {
1300     MigrationState *s = migrate_get_current();
1301     MigrationCapabilityStatusList *cap;
1302     bool cap_list[MIGRATION_CAPABILITY__MAX];
1303 
1304     if (migration_is_running(s->state)) {
1305         error_setg(errp, QERR_MIGRATION_ACTIVE);
1306         return;
1307     }
1308 
1309     memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
1310     if (!migrate_caps_check(cap_list, params, errp)) {
1311         return;
1312     }
1313 
1314     for (cap = params; cap; cap = cap->next) {
1315         s->enabled_capabilities[cap->value->capability] = cap->value->state;
1316     }
1317 }
1318 
1319 /*
1320  * Check whether the parameters are valid. Error will be put into errp
1321  * (if provided). Return true if valid, otherwise false.
1322  */
1323 static bool migrate_params_check(MigrationParameters *params, Error **errp)
1324 {
1325     if (params->has_compress_level &&
1326         (params->compress_level > 9)) {
1327         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
1328                    "a value between 0 and 9");
1329         return false;
1330     }
1331 
1332     if (params->has_compress_threads && (params->compress_threads < 1)) {
1333         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1334                    "compress_threads",
1335                    "a value between 1 and 255");
1336         return false;
1337     }
1338 
1339     if (params->has_decompress_threads && (params->decompress_threads < 1)) {
1340         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1341                    "decompress_threads",
1342                    "a value between 1 and 255");
1343         return false;
1344     }
1345 
1346     if (params->has_throttle_trigger_threshold &&
1347         (params->throttle_trigger_threshold < 1 ||
1348          params->throttle_trigger_threshold > 100)) {
1349         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1350                    "throttle_trigger_threshold",
1351                    "an integer in the range of 1 to 100");
1352         return false;
1353     }
1354 
1355     if (params->has_cpu_throttle_initial &&
1356         (params->cpu_throttle_initial < 1 ||
1357          params->cpu_throttle_initial > 99)) {
1358         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1359                    "cpu_throttle_initial",
1360                    "an integer in the range of 1 to 99");
1361         return false;
1362     }
1363 
1364     if (params->has_cpu_throttle_increment &&
1365         (params->cpu_throttle_increment < 1 ||
1366          params->cpu_throttle_increment > 99)) {
1367         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1368                    "cpu_throttle_increment",
1369                    "an integer in the range of 1 to 99");
1370         return false;
1371     }
1372 
1373     if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
1374         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1375                    "max_bandwidth",
1376                    "an integer in the range of 0 to "stringify(SIZE_MAX)
1377                    " bytes/second");
1378         return false;
1379     }
1380 
1381     if (params->has_downtime_limit &&
1382         (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
1383         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1384                    "downtime_limit",
1385                    "an integer in the range of 0 to "
1386                     stringify(MAX_MIGRATE_DOWNTIME)" ms");
1387         return false;
1388     }
1389 
1390     /* x_checkpoint_delay is now always positive */
1391 
1392     if (params->has_multifd_channels && (params->multifd_channels < 1)) {
1393         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1394                    "multifd_channels",
1395                    "a value between 1 and 255");
1396         return false;
1397     }
1398 
1399     if (params->has_multifd_zlib_level &&
1400         (params->multifd_zlib_level > 9)) {
1401         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level",
1402                    "a value between 0 and 9");
1403         return false;
1404     }
1405 
1406     if (params->has_multifd_zstd_level &&
1407         (params->multifd_zstd_level > 20)) {
1408         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
1409                    "a value between 0 and 20");
1410         return false;
1411     }
1412 
1413     if (params->has_xbzrle_cache_size &&
1414         (params->xbzrle_cache_size < qemu_target_page_size() ||
1415          !is_power_of_2(params->xbzrle_cache_size))) {
1416         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1417                    "xbzrle_cache_size",
1418                    "a power of two no less than the target page size");
1419         return false;
1420     }
1421 
1422     if (params->has_max_cpu_throttle &&
1423         (params->max_cpu_throttle < params->cpu_throttle_initial ||
1424          params->max_cpu_throttle > 99)) {
1425         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1426                    "max_cpu_throttle",
1427                    "an integer in the range of cpu_throttle_initial to 99");
1428         return false;
1429     }
1430 
1431     if (params->has_announce_initial &&
1432         params->announce_initial > 100000) {
1433         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1434                    "announce_initial",
1435                    "a value between 0 and 100000");
1436         return false;
1437     }
1438     if (params->has_announce_max &&
1439         params->announce_max > 100000) {
1440         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1441                    "announce_max",
1442                    "a value between 0 and 100000");
1443        return false;
1444     }
1445     if (params->has_announce_rounds &&
1446         params->announce_rounds > 1000) {
1447         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1448                    "announce_rounds",
1449                    "a value between 0 and 1000");
1450        return false;
1451     }
1452     if (params->has_announce_step &&
1453         (params->announce_step < 1 ||
1454         params->announce_step > 10000)) {
1455         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1456                    "announce_step",
1457                    "a value between 0 and 10000");
1458        return false;
1459     }
1460 
1461     if (params->has_block_bitmap_mapping &&
1462         !check_dirty_bitmap_mig_alias_map(params->block_bitmap_mapping, errp)) {
1463         error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: ");
1464         return false;
1465     }
1466 
1467     return true;
1468 }
1469 
1470 static void migrate_params_test_apply(MigrateSetParameters *params,
1471                                       MigrationParameters *dest)
1472 {
1473     *dest = migrate_get_current()->parameters;
1474 
1475     /* TODO use QAPI_CLONE() instead of duplicating it inline */
1476 
1477     if (params->has_compress_level) {
1478         dest->compress_level = params->compress_level;
1479     }
1480 
1481     if (params->has_compress_threads) {
1482         dest->compress_threads = params->compress_threads;
1483     }
1484 
1485     if (params->has_compress_wait_thread) {
1486         dest->compress_wait_thread = params->compress_wait_thread;
1487     }
1488 
1489     if (params->has_decompress_threads) {
1490         dest->decompress_threads = params->decompress_threads;
1491     }
1492 
1493     if (params->has_throttle_trigger_threshold) {
1494         dest->throttle_trigger_threshold = params->throttle_trigger_threshold;
1495     }
1496 
1497     if (params->has_cpu_throttle_initial) {
1498         dest->cpu_throttle_initial = params->cpu_throttle_initial;
1499     }
1500 
1501     if (params->has_cpu_throttle_increment) {
1502         dest->cpu_throttle_increment = params->cpu_throttle_increment;
1503     }
1504 
1505     if (params->has_cpu_throttle_tailslow) {
1506         dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1507     }
1508 
1509     if (params->has_tls_creds) {
1510         assert(params->tls_creds->type == QTYPE_QSTRING);
1511         dest->tls_creds = params->tls_creds->u.s;
1512     }
1513 
1514     if (params->has_tls_hostname) {
1515         assert(params->tls_hostname->type == QTYPE_QSTRING);
1516         dest->tls_hostname = params->tls_hostname->u.s;
1517     }
1518 
1519     if (params->has_max_bandwidth) {
1520         dest->max_bandwidth = params->max_bandwidth;
1521     }
1522 
1523     if (params->has_downtime_limit) {
1524         dest->downtime_limit = params->downtime_limit;
1525     }
1526 
1527     if (params->has_x_checkpoint_delay) {
1528         dest->x_checkpoint_delay = params->x_checkpoint_delay;
1529     }
1530 
1531     if (params->has_block_incremental) {
1532         dest->block_incremental = params->block_incremental;
1533     }
1534     if (params->has_multifd_channels) {
1535         dest->multifd_channels = params->multifd_channels;
1536     }
1537     if (params->has_multifd_compression) {
1538         dest->multifd_compression = params->multifd_compression;
1539     }
1540     if (params->has_xbzrle_cache_size) {
1541         dest->xbzrle_cache_size = params->xbzrle_cache_size;
1542     }
1543     if (params->has_max_postcopy_bandwidth) {
1544         dest->max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1545     }
1546     if (params->has_max_cpu_throttle) {
1547         dest->max_cpu_throttle = params->max_cpu_throttle;
1548     }
1549     if (params->has_announce_initial) {
1550         dest->announce_initial = params->announce_initial;
1551     }
1552     if (params->has_announce_max) {
1553         dest->announce_max = params->announce_max;
1554     }
1555     if (params->has_announce_rounds) {
1556         dest->announce_rounds = params->announce_rounds;
1557     }
1558     if (params->has_announce_step) {
1559         dest->announce_step = params->announce_step;
1560     }
1561 
1562     if (params->has_block_bitmap_mapping) {
1563         dest->has_block_bitmap_mapping = true;
1564         dest->block_bitmap_mapping = params->block_bitmap_mapping;
1565     }
1566 }
1567 
1568 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
1569 {
1570     MigrationState *s = migrate_get_current();
1571 
1572     /* TODO use QAPI_CLONE() instead of duplicating it inline */
1573 
1574     if (params->has_compress_level) {
1575         s->parameters.compress_level = params->compress_level;
1576     }
1577 
1578     if (params->has_compress_threads) {
1579         s->parameters.compress_threads = params->compress_threads;
1580     }
1581 
1582     if (params->has_compress_wait_thread) {
1583         s->parameters.compress_wait_thread = params->compress_wait_thread;
1584     }
1585 
1586     if (params->has_decompress_threads) {
1587         s->parameters.decompress_threads = params->decompress_threads;
1588     }
1589 
1590     if (params->has_throttle_trigger_threshold) {
1591         s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold;
1592     }
1593 
1594     if (params->has_cpu_throttle_initial) {
1595         s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
1596     }
1597 
1598     if (params->has_cpu_throttle_increment) {
1599         s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
1600     }
1601 
1602     if (params->has_cpu_throttle_tailslow) {
1603         s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1604     }
1605 
1606     if (params->has_tls_creds) {
1607         g_free(s->parameters.tls_creds);
1608         assert(params->tls_creds->type == QTYPE_QSTRING);
1609         s->parameters.tls_creds = g_strdup(params->tls_creds->u.s);
1610     }
1611 
1612     if (params->has_tls_hostname) {
1613         g_free(s->parameters.tls_hostname);
1614         assert(params->tls_hostname->type == QTYPE_QSTRING);
1615         s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s);
1616     }
1617 
1618     if (params->has_tls_authz) {
1619         g_free(s->parameters.tls_authz);
1620         assert(params->tls_authz->type == QTYPE_QSTRING);
1621         s->parameters.tls_authz = g_strdup(params->tls_authz->u.s);
1622     }
1623 
1624     if (params->has_max_bandwidth) {
1625         s->parameters.max_bandwidth = params->max_bandwidth;
1626         if (s->to_dst_file && !migration_in_postcopy()) {
1627             qemu_file_set_rate_limit(s->to_dst_file,
1628                                 s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
1629         }
1630     }
1631 
1632     if (params->has_downtime_limit) {
1633         s->parameters.downtime_limit = params->downtime_limit;
1634     }
1635 
1636     if (params->has_x_checkpoint_delay) {
1637         s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
1638         if (migration_in_colo_state()) {
1639             colo_checkpoint_notify(s);
1640         }
1641     }
1642 
1643     if (params->has_block_incremental) {
1644         s->parameters.block_incremental = params->block_incremental;
1645     }
1646     if (params->has_multifd_channels) {
1647         s->parameters.multifd_channels = params->multifd_channels;
1648     }
1649     if (params->has_multifd_compression) {
1650         s->parameters.multifd_compression = params->multifd_compression;
1651     }
1652     if (params->has_xbzrle_cache_size) {
1653         s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
1654         xbzrle_cache_resize(params->xbzrle_cache_size, errp);
1655     }
1656     if (params->has_max_postcopy_bandwidth) {
1657         s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1658         if (s->to_dst_file && migration_in_postcopy()) {
1659             qemu_file_set_rate_limit(s->to_dst_file,
1660                     s->parameters.max_postcopy_bandwidth / XFER_LIMIT_RATIO);
1661         }
1662     }
1663     if (params->has_max_cpu_throttle) {
1664         s->parameters.max_cpu_throttle = params->max_cpu_throttle;
1665     }
1666     if (params->has_announce_initial) {
1667         s->parameters.announce_initial = params->announce_initial;
1668     }
1669     if (params->has_announce_max) {
1670         s->parameters.announce_max = params->announce_max;
1671     }
1672     if (params->has_announce_rounds) {
1673         s->parameters.announce_rounds = params->announce_rounds;
1674     }
1675     if (params->has_announce_step) {
1676         s->parameters.announce_step = params->announce_step;
1677     }
1678 
1679     if (params->has_block_bitmap_mapping) {
1680         qapi_free_BitmapMigrationNodeAliasList(
1681             s->parameters.block_bitmap_mapping);
1682 
1683         s->parameters.has_block_bitmap_mapping = true;
1684         s->parameters.block_bitmap_mapping =
1685             QAPI_CLONE(BitmapMigrationNodeAliasList,
1686                        params->block_bitmap_mapping);
1687     }
1688 }
1689 
1690 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
1691 {
1692     MigrationParameters tmp;
1693 
1694     /* TODO Rewrite "" to null instead */
1695     if (params->has_tls_creds
1696         && params->tls_creds->type == QTYPE_QNULL) {
1697         qobject_unref(params->tls_creds->u.n);
1698         params->tls_creds->type = QTYPE_QSTRING;
1699         params->tls_creds->u.s = strdup("");
1700     }
1701     /* TODO Rewrite "" to null instead */
1702     if (params->has_tls_hostname
1703         && params->tls_hostname->type == QTYPE_QNULL) {
1704         qobject_unref(params->tls_hostname->u.n);
1705         params->tls_hostname->type = QTYPE_QSTRING;
1706         params->tls_hostname->u.s = strdup("");
1707     }
1708 
1709     migrate_params_test_apply(params, &tmp);
1710 
1711     if (!migrate_params_check(&tmp, errp)) {
1712         /* Invalid parameter */
1713         return;
1714     }
1715 
1716     migrate_params_apply(params, errp);
1717 }
1718 
1719 
1720 void qmp_migrate_start_postcopy(Error **errp)
1721 {
1722     MigrationState *s = migrate_get_current();
1723 
1724     if (!migrate_postcopy()) {
1725         error_setg(errp, "Enable postcopy with migrate_set_capability before"
1726                          " the start of migration");
1727         return;
1728     }
1729 
1730     if (s->state == MIGRATION_STATUS_NONE) {
1731         error_setg(errp, "Postcopy must be started after migration has been"
1732                          " started");
1733         return;
1734     }
1735     /*
1736      * we don't error if migration has finished since that would be racy
1737      * with issuing this command.
1738      */
1739     qatomic_set(&s->start_postcopy, true);
1740 }
1741 
1742 /* shared migration helpers */
1743 
1744 void migrate_set_state(int *state, int old_state, int new_state)
1745 {
1746     assert(new_state < MIGRATION_STATUS__MAX);
1747     if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
1748         trace_migrate_set_state(MigrationStatus_str(new_state));
1749         migrate_generate_event(new_state);
1750     }
1751 }
1752 
1753 static MigrationCapabilityStatus *migrate_cap_add(MigrationCapability index,
1754                                                   bool state)
1755 {
1756     MigrationCapabilityStatus *cap;
1757 
1758     cap = g_new0(MigrationCapabilityStatus, 1);
1759     cap->capability = index;
1760     cap->state = state;
1761 
1762     return cap;
1763 }
1764 
1765 void migrate_set_block_enabled(bool value, Error **errp)
1766 {
1767     MigrationCapabilityStatusList *cap = NULL;
1768 
1769     QAPI_LIST_PREPEND(cap, migrate_cap_add(MIGRATION_CAPABILITY_BLOCK, value));
1770     qmp_migrate_set_capabilities(cap, errp);
1771     qapi_free_MigrationCapabilityStatusList(cap);
1772 }
1773 
1774 static void migrate_set_block_incremental(MigrationState *s, bool value)
1775 {
1776     s->parameters.block_incremental = value;
1777 }
1778 
1779 static void block_cleanup_parameters(MigrationState *s)
1780 {
1781     if (s->must_remove_block_options) {
1782         /* setting to false can never fail */
1783         migrate_set_block_enabled(false, &error_abort);
1784         migrate_set_block_incremental(s, false);
1785         s->must_remove_block_options = false;
1786     }
1787 }
1788 
1789 static void migrate_fd_cleanup(MigrationState *s)
1790 {
1791     qemu_bh_delete(s->cleanup_bh);
1792     s->cleanup_bh = NULL;
1793 
1794     qemu_savevm_state_cleanup();
1795 
1796     if (s->to_dst_file) {
1797         QEMUFile *tmp;
1798 
1799         trace_migrate_fd_cleanup();
1800         qemu_mutex_unlock_iothread();
1801         if (s->migration_thread_running) {
1802             qemu_thread_join(&s->thread);
1803             s->migration_thread_running = false;
1804         }
1805         qemu_mutex_lock_iothread();
1806 
1807         multifd_save_cleanup();
1808         qemu_mutex_lock(&s->qemu_file_lock);
1809         tmp = s->to_dst_file;
1810         s->to_dst_file = NULL;
1811         qemu_mutex_unlock(&s->qemu_file_lock);
1812         /*
1813          * Close the file handle without the lock to make sure the
1814          * critical section won't block for long.
1815          */
1816         migration_ioc_unregister_yank_from_file(tmp);
1817         qemu_fclose(tmp);
1818     }
1819 
1820     assert(!migration_is_active(s));
1821 
1822     if (s->state == MIGRATION_STATUS_CANCELLING) {
1823         migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
1824                           MIGRATION_STATUS_CANCELLED);
1825     }
1826 
1827     if (s->error) {
1828         /* It is used on info migrate.  We can't free it */
1829         error_report_err(error_copy(s->error));
1830     }
1831     notifier_list_notify(&migration_state_notifiers, s);
1832     block_cleanup_parameters(s);
1833     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1834 }
1835 
1836 static void migrate_fd_cleanup_schedule(MigrationState *s)
1837 {
1838     /*
1839      * Ref the state for bh, because it may be called when
1840      * there're already no other refs
1841      */
1842     object_ref(OBJECT(s));
1843     qemu_bh_schedule(s->cleanup_bh);
1844 }
1845 
1846 static void migrate_fd_cleanup_bh(void *opaque)
1847 {
1848     MigrationState *s = opaque;
1849     migrate_fd_cleanup(s);
1850     object_unref(OBJECT(s));
1851 }
1852 
1853 void migrate_set_error(MigrationState *s, const Error *error)
1854 {
1855     QEMU_LOCK_GUARD(&s->error_mutex);
1856     if (!s->error) {
1857         s->error = error_copy(error);
1858     }
1859 }
1860 
1861 static void migrate_error_free(MigrationState *s)
1862 {
1863     QEMU_LOCK_GUARD(&s->error_mutex);
1864     if (s->error) {
1865         error_free(s->error);
1866         s->error = NULL;
1867     }
1868 }
1869 
1870 void migrate_fd_error(MigrationState *s, const Error *error)
1871 {
1872     trace_migrate_fd_error(error_get_pretty(error));
1873     assert(s->to_dst_file == NULL);
1874     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1875                       MIGRATION_STATUS_FAILED);
1876     migrate_set_error(s, error);
1877 }
1878 
1879 static void migrate_fd_cancel(MigrationState *s)
1880 {
1881     int old_state ;
1882     QEMUFile *f = migrate_get_current()->to_dst_file;
1883     trace_migrate_fd_cancel();
1884 
1885     WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
1886         if (s->rp_state.from_dst_file) {
1887             /* shutdown the rp socket, so causing the rp thread to shutdown */
1888             qemu_file_shutdown(s->rp_state.from_dst_file);
1889         }
1890     }
1891 
1892     do {
1893         old_state = s->state;
1894         if (!migration_is_running(old_state)) {
1895             break;
1896         }
1897         /* If the migration is paused, kick it out of the pause */
1898         if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
1899             qemu_sem_post(&s->pause_sem);
1900         }
1901         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
1902     } while (s->state != MIGRATION_STATUS_CANCELLING);
1903 
1904     /*
1905      * If we're unlucky the migration code might be stuck somewhere in a
1906      * send/write while the network has failed and is waiting to timeout;
1907      * if we've got shutdown(2) available then we can force it to quit.
1908      * The outgoing qemu file gets closed in migrate_fd_cleanup that is
1909      * called in a bh, so there is no race against this cancel.
1910      */
1911     if (s->state == MIGRATION_STATUS_CANCELLING && f) {
1912         qemu_file_shutdown(f);
1913     }
1914     if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
1915         Error *local_err = NULL;
1916 
1917         bdrv_invalidate_cache_all(&local_err);
1918         if (local_err) {
1919             error_report_err(local_err);
1920         } else {
1921             s->block_inactive = false;
1922         }
1923     }
1924 }
1925 
1926 void add_migration_state_change_notifier(Notifier *notify)
1927 {
1928     notifier_list_add(&migration_state_notifiers, notify);
1929 }
1930 
1931 void remove_migration_state_change_notifier(Notifier *notify)
1932 {
1933     notifier_remove(notify);
1934 }
1935 
1936 bool migration_in_setup(MigrationState *s)
1937 {
1938     return s->state == MIGRATION_STATUS_SETUP;
1939 }
1940 
1941 bool migration_has_finished(MigrationState *s)
1942 {
1943     return s->state == MIGRATION_STATUS_COMPLETED;
1944 }
1945 
1946 bool migration_has_failed(MigrationState *s)
1947 {
1948     return (s->state == MIGRATION_STATUS_CANCELLED ||
1949             s->state == MIGRATION_STATUS_FAILED);
1950 }
1951 
1952 bool migration_in_postcopy(void)
1953 {
1954     MigrationState *s = migrate_get_current();
1955 
1956     switch (s->state) {
1957     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1958     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1959     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1960         return true;
1961     default:
1962         return false;
1963     }
1964 }
1965 
1966 bool migration_in_postcopy_after_devices(MigrationState *s)
1967 {
1968     return migration_in_postcopy() && s->postcopy_after_devices;
1969 }
1970 
1971 bool migration_in_incoming_postcopy(void)
1972 {
1973     PostcopyState ps = postcopy_state_get();
1974 
1975     return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
1976 }
1977 
1978 bool migration_in_bg_snapshot(void)
1979 {
1980     MigrationState *s = migrate_get_current();
1981 
1982     return migrate_background_snapshot() &&
1983             migration_is_setup_or_active(s->state);
1984 }
1985 
1986 bool migration_is_idle(void)
1987 {
1988     MigrationState *s = current_migration;
1989 
1990     if (!s) {
1991         return true;
1992     }
1993 
1994     switch (s->state) {
1995     case MIGRATION_STATUS_NONE:
1996     case MIGRATION_STATUS_CANCELLED:
1997     case MIGRATION_STATUS_COMPLETED:
1998     case MIGRATION_STATUS_FAILED:
1999         return true;
2000     case MIGRATION_STATUS_SETUP:
2001     case MIGRATION_STATUS_CANCELLING:
2002     case MIGRATION_STATUS_ACTIVE:
2003     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
2004     case MIGRATION_STATUS_COLO:
2005     case MIGRATION_STATUS_PRE_SWITCHOVER:
2006     case MIGRATION_STATUS_DEVICE:
2007     case MIGRATION_STATUS_WAIT_UNPLUG:
2008         return false;
2009     case MIGRATION_STATUS__MAX:
2010         g_assert_not_reached();
2011     }
2012 
2013     return false;
2014 }
2015 
2016 bool migration_is_active(MigrationState *s)
2017 {
2018     return (s->state == MIGRATION_STATUS_ACTIVE ||
2019             s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2020 }
2021 
2022 void migrate_init(MigrationState *s)
2023 {
2024     /*
2025      * Reinitialise all migration state, except
2026      * parameters/capabilities that the user set, and
2027      * locks.
2028      */
2029     s->cleanup_bh = 0;
2030     s->vm_start_bh = 0;
2031     s->to_dst_file = NULL;
2032     s->state = MIGRATION_STATUS_NONE;
2033     s->rp_state.from_dst_file = NULL;
2034     s->rp_state.error = false;
2035     s->mbps = 0.0;
2036     s->pages_per_second = 0.0;
2037     s->downtime = 0;
2038     s->expected_downtime = 0;
2039     s->setup_time = 0;
2040     s->start_postcopy = false;
2041     s->postcopy_after_devices = false;
2042     s->migration_thread_running = false;
2043     error_free(s->error);
2044     s->error = NULL;
2045     s->hostname = NULL;
2046 
2047     migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
2048 
2049     s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2050     s->total_time = 0;
2051     s->vm_was_running = false;
2052     s->iteration_initial_bytes = 0;
2053     s->threshold_size = 0;
2054 }
2055 
2056 int migrate_add_blocker(Error *reason, Error **errp)
2057 {
2058     if (only_migratable) {
2059         error_propagate_prepend(errp, error_copy(reason),
2060                                 "disallowing migration blocker "
2061                                 "(--only-migratable) for: ");
2062         return -EACCES;
2063     }
2064 
2065     if (migration_is_idle()) {
2066         migration_blockers = g_slist_prepend(migration_blockers, reason);
2067         return 0;
2068     }
2069 
2070     error_propagate_prepend(errp, error_copy(reason),
2071                             "disallowing migration blocker "
2072                             "(migration in progress) for: ");
2073     return -EBUSY;
2074 }
2075 
2076 void migrate_del_blocker(Error *reason)
2077 {
2078     migration_blockers = g_slist_remove(migration_blockers, reason);
2079 }
2080 
2081 void qmp_migrate_incoming(const char *uri, Error **errp)
2082 {
2083     Error *local_err = NULL;
2084     static bool once = true;
2085 
2086     if (!once) {
2087         error_setg(errp, "The incoming migration has already been started");
2088         return;
2089     }
2090     if (!runstate_check(RUN_STATE_INMIGRATE)) {
2091         error_setg(errp, "'-incoming' was not specified on the command line");
2092         return;
2093     }
2094 
2095     if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2096         return;
2097     }
2098 
2099     qemu_start_incoming_migration(uri, &local_err);
2100 
2101     if (local_err) {
2102         yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2103         error_propagate(errp, local_err);
2104         return;
2105     }
2106 
2107     once = false;
2108 }
2109 
2110 void qmp_migrate_recover(const char *uri, Error **errp)
2111 {
2112     MigrationIncomingState *mis = migration_incoming_get_current();
2113 
2114     /*
2115      * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
2116      * callers (no one should ignore a recover failure); if there is, it's a
2117      * programming error.
2118      */
2119     assert(errp);
2120 
2121     if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2122         error_setg(errp, "Migrate recover can only be run "
2123                    "when postcopy is paused.");
2124         return;
2125     }
2126 
2127     if (qatomic_cmpxchg(&mis->postcopy_recover_triggered,
2128                        false, true) == true) {
2129         error_setg(errp, "Migrate recovery is triggered already");
2130         return;
2131     }
2132 
2133     /*
2134      * Note that this call will never start a real migration; it will
2135      * only re-setup the migration stream and poke existing migration
2136      * to continue using that newly established channel.
2137      */
2138     qemu_start_incoming_migration(uri, errp);
2139 
2140     /* Safe to dereference with the assert above */
2141     if (*errp) {
2142         /* Reset the flag so user could still retry */
2143         qatomic_set(&mis->postcopy_recover_triggered, false);
2144     }
2145 }
2146 
2147 void qmp_migrate_pause(Error **errp)
2148 {
2149     MigrationState *ms = migrate_get_current();
2150     MigrationIncomingState *mis = migration_incoming_get_current();
2151     int ret;
2152 
2153     if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2154         /* Source side, during postcopy */
2155         qemu_mutex_lock(&ms->qemu_file_lock);
2156         ret = qemu_file_shutdown(ms->to_dst_file);
2157         qemu_mutex_unlock(&ms->qemu_file_lock);
2158         if (ret) {
2159             error_setg(errp, "Failed to pause source migration");
2160         }
2161         return;
2162     }
2163 
2164     if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2165         ret = qemu_file_shutdown(mis->from_src_file);
2166         if (ret) {
2167             error_setg(errp, "Failed to pause destination migration");
2168         }
2169         return;
2170     }
2171 
2172     error_setg(errp, "migrate-pause is currently only supported "
2173                "during postcopy-active state");
2174 }
2175 
2176 bool migration_is_blocked(Error **errp)
2177 {
2178     if (qemu_savevm_state_blocked(errp)) {
2179         return true;
2180     }
2181 
2182     if (migration_blockers) {
2183         error_propagate(errp, error_copy(migration_blockers->data));
2184         return true;
2185     }
2186 
2187     return false;
2188 }
2189 
2190 /* Returns true if continue to migrate, or false if error detected */
2191 static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
2192                             bool resume, Error **errp)
2193 {
2194     Error *local_err = NULL;
2195 
2196     if (resume) {
2197         if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2198             error_setg(errp, "Cannot resume if there is no "
2199                        "paused migration");
2200             return false;
2201         }
2202 
2203         /*
2204          * Postcopy recovery won't work well with release-ram
2205          * capability since release-ram will drop the page buffer as
2206          * long as the page is put into the send buffer.  So if there
2207          * is a network failure happened, any page buffers that have
2208          * not yet reached the destination VM but have already been
2209          * sent from the source VM will be lost forever.  Let's refuse
2210          * the client from resuming such a postcopy migration.
2211          * Luckily release-ram was designed to only be used when src
2212          * and destination VMs are on the same host, so it should be
2213          * fine.
2214          */
2215         if (migrate_release_ram()) {
2216             error_setg(errp, "Postcopy recovery cannot work "
2217                        "when release-ram capability is set");
2218             return false;
2219         }
2220 
2221         /* This is a resume, skip init status */
2222         return true;
2223     }
2224 
2225     if (migration_is_running(s->state)) {
2226         error_setg(errp, QERR_MIGRATION_ACTIVE);
2227         return false;
2228     }
2229 
2230     if (runstate_check(RUN_STATE_INMIGRATE)) {
2231         error_setg(errp, "Guest is waiting for an incoming migration");
2232         return false;
2233     }
2234 
2235     if (runstate_check(RUN_STATE_POSTMIGRATE)) {
2236         error_setg(errp, "Can't migrate the vm that was paused due to "
2237                    "previous migration");
2238         return false;
2239     }
2240 
2241     if (migration_is_blocked(errp)) {
2242         return false;
2243     }
2244 
2245     if (blk || blk_inc) {
2246         if (migrate_colo_enabled()) {
2247             error_setg(errp, "No disk migration is required in COLO mode");
2248             return false;
2249         }
2250         if (migrate_use_block() || migrate_use_block_incremental()) {
2251             error_setg(errp, "Command options are incompatible with "
2252                        "current migration capabilities");
2253             return false;
2254         }
2255         migrate_set_block_enabled(true, &local_err);
2256         if (local_err) {
2257             error_propagate(errp, local_err);
2258             return false;
2259         }
2260         s->must_remove_block_options = true;
2261     }
2262 
2263     if (blk_inc) {
2264         migrate_set_block_incremental(s, true);
2265     }
2266 
2267     migrate_init(s);
2268     /*
2269      * set ram_counters memory to zero for a
2270      * new migration
2271      */
2272     memset(&ram_counters, 0, sizeof(ram_counters));
2273 
2274     return true;
2275 }
2276 
2277 void qmp_migrate(const char *uri, bool has_blk, bool blk,
2278                  bool has_inc, bool inc, bool has_detach, bool detach,
2279                  bool has_resume, bool resume, Error **errp)
2280 {
2281     Error *local_err = NULL;
2282     MigrationState *s = migrate_get_current();
2283     const char *p = NULL;
2284 
2285     if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
2286                          has_resume && resume, errp)) {
2287         /* Error detected, put into errp */
2288         return;
2289     }
2290 
2291     if (!(has_resume && resume)) {
2292         if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2293             return;
2294         }
2295     }
2296 
2297     if (strstart(uri, "tcp:", &p) ||
2298         strstart(uri, "unix:", NULL) ||
2299         strstart(uri, "vsock:", NULL)) {
2300         socket_start_outgoing_migration(s, p ? p : uri, &local_err);
2301 #ifdef CONFIG_RDMA
2302     } else if (strstart(uri, "rdma:", &p)) {
2303         rdma_start_outgoing_migration(s, p, &local_err);
2304 #endif
2305     } else if (strstart(uri, "exec:", &p)) {
2306         exec_start_outgoing_migration(s, p, &local_err);
2307     } else if (strstart(uri, "fd:", &p)) {
2308         fd_start_outgoing_migration(s, p, &local_err);
2309     } else {
2310         if (!(has_resume && resume)) {
2311             yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2312         }
2313         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
2314                    "a valid migration protocol");
2315         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2316                           MIGRATION_STATUS_FAILED);
2317         block_cleanup_parameters(s);
2318         return;
2319     }
2320 
2321     if (local_err) {
2322         if (!(has_resume && resume)) {
2323             yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2324         }
2325         migrate_fd_error(s, local_err);
2326         error_propagate(errp, local_err);
2327         return;
2328     }
2329 }
2330 
2331 void qmp_migrate_cancel(Error **errp)
2332 {
2333     migration_cancel();
2334 }
2335 
2336 void qmp_migrate_continue(MigrationStatus state, Error **errp)
2337 {
2338     MigrationState *s = migrate_get_current();
2339     if (s->state != state) {
2340         error_setg(errp,  "Migration not in expected state: %s",
2341                    MigrationStatus_str(s->state));
2342         return;
2343     }
2344     qemu_sem_post(&s->pause_sem);
2345 }
2346 
2347 bool migrate_release_ram(void)
2348 {
2349     MigrationState *s;
2350 
2351     s = migrate_get_current();
2352 
2353     return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
2354 }
2355 
2356 bool migrate_postcopy_ram(void)
2357 {
2358     MigrationState *s;
2359 
2360     s = migrate_get_current();
2361 
2362     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
2363 }
2364 
2365 bool migrate_postcopy(void)
2366 {
2367     return migrate_postcopy_ram() || migrate_dirty_bitmaps();
2368 }
2369 
2370 bool migrate_auto_converge(void)
2371 {
2372     MigrationState *s;
2373 
2374     s = migrate_get_current();
2375 
2376     return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
2377 }
2378 
2379 bool migrate_zero_blocks(void)
2380 {
2381     MigrationState *s;
2382 
2383     s = migrate_get_current();
2384 
2385     return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
2386 }
2387 
2388 bool migrate_postcopy_blocktime(void)
2389 {
2390     MigrationState *s;
2391 
2392     s = migrate_get_current();
2393 
2394     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
2395 }
2396 
2397 bool migrate_use_compression(void)
2398 {
2399     MigrationState *s;
2400 
2401     s = migrate_get_current();
2402 
2403     return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
2404 }
2405 
2406 int migrate_compress_level(void)
2407 {
2408     MigrationState *s;
2409 
2410     s = migrate_get_current();
2411 
2412     return s->parameters.compress_level;
2413 }
2414 
2415 int migrate_compress_threads(void)
2416 {
2417     MigrationState *s;
2418 
2419     s = migrate_get_current();
2420 
2421     return s->parameters.compress_threads;
2422 }
2423 
2424 int migrate_compress_wait_thread(void)
2425 {
2426     MigrationState *s;
2427 
2428     s = migrate_get_current();
2429 
2430     return s->parameters.compress_wait_thread;
2431 }
2432 
2433 int migrate_decompress_threads(void)
2434 {
2435     MigrationState *s;
2436 
2437     s = migrate_get_current();
2438 
2439     return s->parameters.decompress_threads;
2440 }
2441 
2442 bool migrate_dirty_bitmaps(void)
2443 {
2444     MigrationState *s;
2445 
2446     s = migrate_get_current();
2447 
2448     return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
2449 }
2450 
2451 bool migrate_ignore_shared(void)
2452 {
2453     MigrationState *s;
2454 
2455     s = migrate_get_current();
2456 
2457     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED];
2458 }
2459 
2460 bool migrate_validate_uuid(void)
2461 {
2462     MigrationState *s;
2463 
2464     s = migrate_get_current();
2465 
2466     return s->enabled_capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID];
2467 }
2468 
2469 bool migrate_use_events(void)
2470 {
2471     MigrationState *s;
2472 
2473     s = migrate_get_current();
2474 
2475     return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
2476 }
2477 
2478 bool migrate_use_multifd(void)
2479 {
2480     MigrationState *s;
2481 
2482     s = migrate_get_current();
2483 
2484     return s->enabled_capabilities[MIGRATION_CAPABILITY_MULTIFD];
2485 }
2486 
2487 bool migrate_pause_before_switchover(void)
2488 {
2489     MigrationState *s;
2490 
2491     s = migrate_get_current();
2492 
2493     return s->enabled_capabilities[
2494         MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
2495 }
2496 
2497 int migrate_multifd_channels(void)
2498 {
2499     MigrationState *s;
2500 
2501     s = migrate_get_current();
2502 
2503     return s->parameters.multifd_channels;
2504 }
2505 
2506 MultiFDCompression migrate_multifd_compression(void)
2507 {
2508     MigrationState *s;
2509 
2510     s = migrate_get_current();
2511 
2512     return s->parameters.multifd_compression;
2513 }
2514 
2515 int migrate_multifd_zlib_level(void)
2516 {
2517     MigrationState *s;
2518 
2519     s = migrate_get_current();
2520 
2521     return s->parameters.multifd_zlib_level;
2522 }
2523 
2524 int migrate_multifd_zstd_level(void)
2525 {
2526     MigrationState *s;
2527 
2528     s = migrate_get_current();
2529 
2530     return s->parameters.multifd_zstd_level;
2531 }
2532 
2533 int migrate_use_xbzrle(void)
2534 {
2535     MigrationState *s;
2536 
2537     s = migrate_get_current();
2538 
2539     return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
2540 }
2541 
2542 uint64_t migrate_xbzrle_cache_size(void)
2543 {
2544     MigrationState *s;
2545 
2546     s = migrate_get_current();
2547 
2548     return s->parameters.xbzrle_cache_size;
2549 }
2550 
2551 static int64_t migrate_max_postcopy_bandwidth(void)
2552 {
2553     MigrationState *s;
2554 
2555     s = migrate_get_current();
2556 
2557     return s->parameters.max_postcopy_bandwidth;
2558 }
2559 
2560 bool migrate_use_block(void)
2561 {
2562     MigrationState *s;
2563 
2564     s = migrate_get_current();
2565 
2566     return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK];
2567 }
2568 
2569 bool migrate_use_return_path(void)
2570 {
2571     MigrationState *s;
2572 
2573     s = migrate_get_current();
2574 
2575     return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
2576 }
2577 
2578 bool migrate_use_block_incremental(void)
2579 {
2580     MigrationState *s;
2581 
2582     s = migrate_get_current();
2583 
2584     return s->parameters.block_incremental;
2585 }
2586 
2587 bool migrate_background_snapshot(void)
2588 {
2589     MigrationState *s;
2590 
2591     s = migrate_get_current();
2592 
2593     return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
2594 }
2595 
2596 /* migration thread support */
2597 /*
2598  * Something bad happened to the RP stream, mark an error
2599  * The caller shall print or trace something to indicate why
2600  */
2601 static void mark_source_rp_bad(MigrationState *s)
2602 {
2603     s->rp_state.error = true;
2604 }
2605 
2606 static struct rp_cmd_args {
2607     ssize_t     len; /* -1 = variable */
2608     const char *name;
2609 } rp_cmd_args[] = {
2610     [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
2611     [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
2612     [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
2613     [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
2614     [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
2615     [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
2616     [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
2617     [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
2618 };
2619 
2620 /*
2621  * Process a request for pages received on the return path,
2622  * We're allowed to send more than requested (e.g. to round to our page size)
2623  * and we don't need to send pages that have already been sent.
2624  */
2625 static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
2626                                        ram_addr_t start, size_t len)
2627 {
2628     long our_host_ps = qemu_real_host_page_size;
2629 
2630     trace_migrate_handle_rp_req_pages(rbname, start, len);
2631 
2632     /*
2633      * Since we currently insist on matching page sizes, just sanity check
2634      * we're being asked for whole host pages.
2635      */
2636     if (start & (our_host_ps - 1) ||
2637        (len & (our_host_ps - 1))) {
2638         error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
2639                      " len: %zd", __func__, start, len);
2640         mark_source_rp_bad(ms);
2641         return;
2642     }
2643 
2644     if (ram_save_queue_pages(rbname, start, len)) {
2645         mark_source_rp_bad(ms);
2646     }
2647 }
2648 
2649 /* Return true to retry, false to quit */
2650 static bool postcopy_pause_return_path_thread(MigrationState *s)
2651 {
2652     trace_postcopy_pause_return_path();
2653 
2654     qemu_sem_wait(&s->postcopy_pause_rp_sem);
2655 
2656     trace_postcopy_pause_return_path_continued();
2657 
2658     return true;
2659 }
2660 
2661 static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
2662 {
2663     RAMBlock *block = qemu_ram_block_by_name(block_name);
2664 
2665     if (!block) {
2666         error_report("%s: invalid block name '%s'", __func__, block_name);
2667         return -EINVAL;
2668     }
2669 
2670     /* Fetch the received bitmap and refresh the dirty bitmap */
2671     return ram_dirty_bitmap_reload(s, block);
2672 }
2673 
2674 static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
2675 {
2676     trace_source_return_path_thread_resume_ack(value);
2677 
2678     if (value != MIGRATION_RESUME_ACK_VALUE) {
2679         error_report("%s: illegal resume_ack value %"PRIu32,
2680                      __func__, value);
2681         return -1;
2682     }
2683 
2684     /* Now both sides are active. */
2685     migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2686                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
2687 
2688     /* Notify send thread that time to continue send pages */
2689     qemu_sem_post(&s->rp_state.rp_sem);
2690 
2691     return 0;
2692 }
2693 
2694 /* Release ms->rp_state.from_dst_file in a safe way */
2695 static void migration_release_from_dst_file(MigrationState *ms)
2696 {
2697     QEMUFile *file;
2698 
2699     WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2700         /*
2701          * Reset the from_dst_file pointer first before releasing it, as we
2702          * can't block within lock section
2703          */
2704         file = ms->rp_state.from_dst_file;
2705         ms->rp_state.from_dst_file = NULL;
2706     }
2707 
2708     qemu_fclose(file);
2709 }
2710 
2711 /*
2712  * Handles messages sent on the return path towards the source VM
2713  *
2714  */
2715 static void *source_return_path_thread(void *opaque)
2716 {
2717     MigrationState *ms = opaque;
2718     QEMUFile *rp = ms->rp_state.from_dst_file;
2719     uint16_t header_len, header_type;
2720     uint8_t buf[512];
2721     uint32_t tmp32, sibling_error;
2722     ram_addr_t start = 0; /* =0 to silence warning */
2723     size_t  len = 0, expected_len;
2724     int res;
2725 
2726     trace_source_return_path_thread_entry();
2727     rcu_register_thread();
2728 
2729 retry:
2730     while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
2731            migration_is_setup_or_active(ms->state)) {
2732         trace_source_return_path_thread_loop_top();
2733         header_type = qemu_get_be16(rp);
2734         header_len = qemu_get_be16(rp);
2735 
2736         if (qemu_file_get_error(rp)) {
2737             mark_source_rp_bad(ms);
2738             goto out;
2739         }
2740 
2741         if (header_type >= MIG_RP_MSG_MAX ||
2742             header_type == MIG_RP_MSG_INVALID) {
2743             error_report("RP: Received invalid message 0x%04x length 0x%04x",
2744                          header_type, header_len);
2745             mark_source_rp_bad(ms);
2746             goto out;
2747         }
2748 
2749         if ((rp_cmd_args[header_type].len != -1 &&
2750             header_len != rp_cmd_args[header_type].len) ||
2751             header_len > sizeof(buf)) {
2752             error_report("RP: Received '%s' message (0x%04x) with"
2753                          "incorrect length %d expecting %zu",
2754                          rp_cmd_args[header_type].name, header_type, header_len,
2755                          (size_t)rp_cmd_args[header_type].len);
2756             mark_source_rp_bad(ms);
2757             goto out;
2758         }
2759 
2760         /* We know we've got a valid header by this point */
2761         res = qemu_get_buffer(rp, buf, header_len);
2762         if (res != header_len) {
2763             error_report("RP: Failed reading data for message 0x%04x"
2764                          " read %d expected %d",
2765                          header_type, res, header_len);
2766             mark_source_rp_bad(ms);
2767             goto out;
2768         }
2769 
2770         /* OK, we have the message and the data */
2771         switch (header_type) {
2772         case MIG_RP_MSG_SHUT:
2773             sibling_error = ldl_be_p(buf);
2774             trace_source_return_path_thread_shut(sibling_error);
2775             if (sibling_error) {
2776                 error_report("RP: Sibling indicated error %d", sibling_error);
2777                 mark_source_rp_bad(ms);
2778             }
2779             /*
2780              * We'll let the main thread deal with closing the RP
2781              * we could do a shutdown(2) on it, but we're the only user
2782              * anyway, so there's nothing gained.
2783              */
2784             goto out;
2785 
2786         case MIG_RP_MSG_PONG:
2787             tmp32 = ldl_be_p(buf);
2788             trace_source_return_path_thread_pong(tmp32);
2789             break;
2790 
2791         case MIG_RP_MSG_REQ_PAGES:
2792             start = ldq_be_p(buf);
2793             len = ldl_be_p(buf + 8);
2794             migrate_handle_rp_req_pages(ms, NULL, start, len);
2795             break;
2796 
2797         case MIG_RP_MSG_REQ_PAGES_ID:
2798             expected_len = 12 + 1; /* header + termination */
2799 
2800             if (header_len >= expected_len) {
2801                 start = ldq_be_p(buf);
2802                 len = ldl_be_p(buf + 8);
2803                 /* Now we expect an idstr */
2804                 tmp32 = buf[12]; /* Length of the following idstr */
2805                 buf[13 + tmp32] = '\0';
2806                 expected_len += tmp32;
2807             }
2808             if (header_len != expected_len) {
2809                 error_report("RP: Req_Page_id with length %d expecting %zd",
2810                              header_len, expected_len);
2811                 mark_source_rp_bad(ms);
2812                 goto out;
2813             }
2814             migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
2815             break;
2816 
2817         case MIG_RP_MSG_RECV_BITMAP:
2818             if (header_len < 1) {
2819                 error_report("%s: missing block name", __func__);
2820                 mark_source_rp_bad(ms);
2821                 goto out;
2822             }
2823             /* Format: len (1B) + idstr (<255B). This ends the idstr. */
2824             buf[buf[0] + 1] = '\0';
2825             if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
2826                 mark_source_rp_bad(ms);
2827                 goto out;
2828             }
2829             break;
2830 
2831         case MIG_RP_MSG_RESUME_ACK:
2832             tmp32 = ldl_be_p(buf);
2833             if (migrate_handle_rp_resume_ack(ms, tmp32)) {
2834                 mark_source_rp_bad(ms);
2835                 goto out;
2836             }
2837             break;
2838 
2839         default:
2840             break;
2841         }
2842     }
2843 
2844 out:
2845     res = qemu_file_get_error(rp);
2846     if (res) {
2847         if (res == -EIO && migration_in_postcopy()) {
2848             /*
2849              * Maybe there is something we can do: it looks like a
2850              * network down issue, and we pause for a recovery.
2851              */
2852             migration_release_from_dst_file(ms);
2853             rp = NULL;
2854             if (postcopy_pause_return_path_thread(ms)) {
2855                 /*
2856                  * Reload rp, reset the rest.  Referencing it is safe since
2857                  * it's reset only by us above, or when migration completes
2858                  */
2859                 rp = ms->rp_state.from_dst_file;
2860                 ms->rp_state.error = false;
2861                 goto retry;
2862             }
2863         }
2864 
2865         trace_source_return_path_thread_bad_end();
2866         mark_source_rp_bad(ms);
2867     }
2868 
2869     trace_source_return_path_thread_end();
2870     migration_release_from_dst_file(ms);
2871     rcu_unregister_thread();
2872     return NULL;
2873 }
2874 
2875 static int open_return_path_on_source(MigrationState *ms,
2876                                       bool create_thread)
2877 {
2878     ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
2879     if (!ms->rp_state.from_dst_file) {
2880         return -1;
2881     }
2882 
2883     trace_open_return_path_on_source();
2884 
2885     if (!create_thread) {
2886         /* We're done */
2887         return 0;
2888     }
2889 
2890     qemu_thread_create(&ms->rp_state.rp_thread, "return path",
2891                        source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
2892     ms->rp_state.rp_thread_created = true;
2893 
2894     trace_open_return_path_on_source_continue();
2895 
2896     return 0;
2897 }
2898 
2899 /* Returns 0 if the RP was ok, otherwise there was an error on the RP */
2900 static int await_return_path_close_on_source(MigrationState *ms)
2901 {
2902     /*
2903      * If this is a normal exit then the destination will send a SHUT and the
2904      * rp_thread will exit, however if there's an error we need to cause
2905      * it to exit.
2906      */
2907     if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
2908         /*
2909          * shutdown(2), if we have it, will cause it to unblock if it's stuck
2910          * waiting for the destination.
2911          */
2912         qemu_file_shutdown(ms->rp_state.from_dst_file);
2913         mark_source_rp_bad(ms);
2914     }
2915     trace_await_return_path_close_on_source_joining();
2916     qemu_thread_join(&ms->rp_state.rp_thread);
2917     ms->rp_state.rp_thread_created = false;
2918     trace_await_return_path_close_on_source_close();
2919     return ms->rp_state.error;
2920 }
2921 
2922 /*
2923  * Switch from normal iteration to postcopy
2924  * Returns non-0 on error
2925  */
2926 static int postcopy_start(MigrationState *ms)
2927 {
2928     int ret;
2929     QIOChannelBuffer *bioc;
2930     QEMUFile *fb;
2931     int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2932     int64_t bandwidth = migrate_max_postcopy_bandwidth();
2933     bool restart_block = false;
2934     int cur_state = MIGRATION_STATUS_ACTIVE;
2935     if (!migrate_pause_before_switchover()) {
2936         migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
2937                           MIGRATION_STATUS_POSTCOPY_ACTIVE);
2938     }
2939 
2940     trace_postcopy_start();
2941     qemu_mutex_lock_iothread();
2942     trace_postcopy_start_set_run();
2943 
2944     qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
2945     global_state_store();
2946     ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
2947     if (ret < 0) {
2948         goto fail;
2949     }
2950 
2951     ret = migration_maybe_pause(ms, &cur_state,
2952                                 MIGRATION_STATUS_POSTCOPY_ACTIVE);
2953     if (ret < 0) {
2954         goto fail;
2955     }
2956 
2957     ret = bdrv_inactivate_all();
2958     if (ret < 0) {
2959         goto fail;
2960     }
2961     restart_block = true;
2962 
2963     /*
2964      * Cause any non-postcopiable, but iterative devices to
2965      * send out their final data.
2966      */
2967     qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
2968 
2969     /*
2970      * in Finish migrate and with the io-lock held everything should
2971      * be quiet, but we've potentially still got dirty pages and we
2972      * need to tell the destination to throw any pages it's already received
2973      * that are dirty
2974      */
2975     if (migrate_postcopy_ram()) {
2976         if (ram_postcopy_send_discard_bitmap(ms)) {
2977             error_report("postcopy send discard bitmap failed");
2978             goto fail;
2979         }
2980     }
2981 
2982     /*
2983      * send rest of state - note things that are doing postcopy
2984      * will notice we're in POSTCOPY_ACTIVE and not actually
2985      * wrap their state up here
2986      */
2987     /* 0 max-postcopy-bandwidth means unlimited */
2988     if (!bandwidth) {
2989         qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
2990     } else {
2991         qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
2992     }
2993     if (migrate_postcopy_ram()) {
2994         /* Ping just for debugging, helps line traces up */
2995         qemu_savevm_send_ping(ms->to_dst_file, 2);
2996     }
2997 
2998     /*
2999      * While loading the device state we may trigger page transfer
3000      * requests and the fd must be free to process those, and thus
3001      * the destination must read the whole device state off the fd before
3002      * it starts processing it.  Unfortunately the ad-hoc migration format
3003      * doesn't allow the destination to know the size to read without fully
3004      * parsing it through each devices load-state code (especially the open
3005      * coded devices that use get/put).
3006      * So we wrap the device state up in a package with a length at the start;
3007      * to do this we use a qemu_buf to hold the whole of the device state.
3008      */
3009     bioc = qio_channel_buffer_new(4096);
3010     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
3011     fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
3012     object_unref(OBJECT(bioc));
3013 
3014     /*
3015      * Make sure the receiver can get incoming pages before we send the rest
3016      * of the state
3017      */
3018     qemu_savevm_send_postcopy_listen(fb);
3019 
3020     qemu_savevm_state_complete_precopy(fb, false, false);
3021     if (migrate_postcopy_ram()) {
3022         qemu_savevm_send_ping(fb, 3);
3023     }
3024 
3025     qemu_savevm_send_postcopy_run(fb);
3026 
3027     /* <><> end of stuff going into the package */
3028 
3029     /* Last point of recovery; as soon as we send the package the destination
3030      * can open devices and potentially start running.
3031      * Lets just check again we've not got any errors.
3032      */
3033     ret = qemu_file_get_error(ms->to_dst_file);
3034     if (ret) {
3035         error_report("postcopy_start: Migration stream errored (pre package)");
3036         goto fail_closefb;
3037     }
3038 
3039     restart_block = false;
3040 
3041     /* Now send that blob */
3042     if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
3043         goto fail_closefb;
3044     }
3045     qemu_fclose(fb);
3046 
3047     /* Send a notify to give a chance for anything that needs to happen
3048      * at the transition to postcopy and after the device state; in particular
3049      * spice needs to trigger a transition now
3050      */
3051     ms->postcopy_after_devices = true;
3052     notifier_list_notify(&migration_state_notifiers, ms);
3053 
3054     ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
3055 
3056     qemu_mutex_unlock_iothread();
3057 
3058     if (migrate_postcopy_ram()) {
3059         /*
3060          * Although this ping is just for debug, it could potentially be
3061          * used for getting a better measurement of downtime at the source.
3062          */
3063         qemu_savevm_send_ping(ms->to_dst_file, 4);
3064     }
3065 
3066     if (migrate_release_ram()) {
3067         ram_postcopy_migrated_memory_release(ms);
3068     }
3069 
3070     ret = qemu_file_get_error(ms->to_dst_file);
3071     if (ret) {
3072         error_report("postcopy_start: Migration stream errored");
3073         migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
3074                               MIGRATION_STATUS_FAILED);
3075     }
3076 
3077     return ret;
3078 
3079 fail_closefb:
3080     qemu_fclose(fb);
3081 fail:
3082     migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
3083                           MIGRATION_STATUS_FAILED);
3084     if (restart_block) {
3085         /* A failure happened early enough that we know the destination hasn't
3086          * accessed block devices, so we're safe to recover.
3087          */
3088         Error *local_err = NULL;
3089 
3090         bdrv_invalidate_cache_all(&local_err);
3091         if (local_err) {
3092             error_report_err(local_err);
3093         }
3094     }
3095     qemu_mutex_unlock_iothread();
3096     return -1;
3097 }
3098 
3099 /**
3100  * migration_maybe_pause: Pause if required to by
3101  * migrate_pause_before_switchover called with the iothread locked
3102  * Returns: 0 on success
3103  */
3104 static int migration_maybe_pause(MigrationState *s,
3105                                  int *current_active_state,
3106                                  int new_state)
3107 {
3108     if (!migrate_pause_before_switchover()) {
3109         return 0;
3110     }
3111 
3112     /* Since leaving this state is not atomic with posting the semaphore
3113      * it's possible that someone could have issued multiple migrate_continue
3114      * and the semaphore is incorrectly positive at this point;
3115      * the docs say it's undefined to reinit a semaphore that's already
3116      * init'd, so use timedwait to eat up any existing posts.
3117      */
3118     while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
3119         /* This block intentionally left blank */
3120     }
3121 
3122     /*
3123      * If the migration is cancelled when it is in the completion phase,
3124      * the migration state is set to MIGRATION_STATUS_CANCELLING.
3125      * So we don't need to wait a semaphore, otherwise we would always
3126      * wait for the 'pause_sem' semaphore.
3127      */
3128     if (s->state != MIGRATION_STATUS_CANCELLING) {
3129         qemu_mutex_unlock_iothread();
3130         migrate_set_state(&s->state, *current_active_state,
3131                           MIGRATION_STATUS_PRE_SWITCHOVER);
3132         qemu_sem_wait(&s->pause_sem);
3133         migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
3134                           new_state);
3135         *current_active_state = new_state;
3136         qemu_mutex_lock_iothread();
3137     }
3138 
3139     return s->state == new_state ? 0 : -EINVAL;
3140 }
3141 
3142 /**
3143  * migration_completion: Used by migration_thread when there's not much left.
3144  *   The caller 'breaks' the loop when this returns.
3145  *
3146  * @s: Current migration state
3147  */
3148 static void migration_completion(MigrationState *s)
3149 {
3150     int ret;
3151     int current_active_state = s->state;
3152 
3153     if (s->state == MIGRATION_STATUS_ACTIVE) {
3154         qemu_mutex_lock_iothread();
3155         s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3156         qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
3157         s->vm_was_running = runstate_is_running();
3158         ret = global_state_store();
3159 
3160         if (!ret) {
3161             bool inactivate = !migrate_colo_enabled();
3162             ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
3163             trace_migration_completion_vm_stop(ret);
3164             if (ret >= 0) {
3165                 ret = migration_maybe_pause(s, &current_active_state,
3166                                             MIGRATION_STATUS_DEVICE);
3167             }
3168             if (ret >= 0) {
3169                 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
3170                 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
3171                                                          inactivate);
3172             }
3173             if (inactivate && ret >= 0) {
3174                 s->block_inactive = true;
3175             }
3176         }
3177         qemu_mutex_unlock_iothread();
3178 
3179         if (ret < 0) {
3180             goto fail;
3181         }
3182     } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3183         trace_migration_completion_postcopy_end();
3184 
3185         qemu_savevm_state_complete_postcopy(s->to_dst_file);
3186         trace_migration_completion_postcopy_end_after_complete();
3187     } else if (s->state == MIGRATION_STATUS_CANCELLING) {
3188         goto fail;
3189     }
3190 
3191     /*
3192      * If rp was opened we must clean up the thread before
3193      * cleaning everything else up (since if there are no failures
3194      * it will wait for the destination to send it's status in
3195      * a SHUT command).
3196      */
3197     if (s->rp_state.rp_thread_created) {
3198         int rp_error;
3199         trace_migration_return_path_end_before();
3200         rp_error = await_return_path_close_on_source(s);
3201         trace_migration_return_path_end_after(rp_error);
3202         if (rp_error) {
3203             goto fail_invalidate;
3204         }
3205     }
3206 
3207     if (qemu_file_get_error(s->to_dst_file)) {
3208         trace_migration_completion_file_err();
3209         goto fail_invalidate;
3210     }
3211 
3212     if (!migrate_colo_enabled()) {
3213         migrate_set_state(&s->state, current_active_state,
3214                           MIGRATION_STATUS_COMPLETED);
3215     }
3216 
3217     return;
3218 
3219 fail_invalidate:
3220     /* If not doing postcopy, vm_start() will be called: let's regain
3221      * control on images.
3222      */
3223     if (s->state == MIGRATION_STATUS_ACTIVE ||
3224         s->state == MIGRATION_STATUS_DEVICE) {
3225         Error *local_err = NULL;
3226 
3227         qemu_mutex_lock_iothread();
3228         bdrv_invalidate_cache_all(&local_err);
3229         if (local_err) {
3230             error_report_err(local_err);
3231         } else {
3232             s->block_inactive = false;
3233         }
3234         qemu_mutex_unlock_iothread();
3235     }
3236 
3237 fail:
3238     migrate_set_state(&s->state, current_active_state,
3239                       MIGRATION_STATUS_FAILED);
3240 }
3241 
3242 /**
3243  * bg_migration_completion: Used by bg_migration_thread when after all the
3244  *   RAM has been saved. The caller 'breaks' the loop when this returns.
3245  *
3246  * @s: Current migration state
3247  */
3248 static void bg_migration_completion(MigrationState *s)
3249 {
3250     int current_active_state = s->state;
3251 
3252     /*
3253      * Stop tracking RAM writes - un-protect memory, un-register UFFD
3254      * memory ranges, flush kernel wait queues and wake up threads
3255      * waiting for write fault to be resolved.
3256      */
3257     ram_write_tracking_stop();
3258 
3259     if (s->state == MIGRATION_STATUS_ACTIVE) {
3260         /*
3261          * By this moment we have RAM content saved into the migration stream.
3262          * The next step is to flush the non-RAM content (device state)
3263          * right after the ram content. The device state has been stored into
3264          * the temporary buffer before RAM saving started.
3265          */
3266         qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
3267         qemu_fflush(s->to_dst_file);
3268     } else if (s->state == MIGRATION_STATUS_CANCELLING) {
3269         goto fail;
3270     }
3271 
3272     if (qemu_file_get_error(s->to_dst_file)) {
3273         trace_migration_completion_file_err();
3274         goto fail;
3275     }
3276 
3277     migrate_set_state(&s->state, current_active_state,
3278                       MIGRATION_STATUS_COMPLETED);
3279     return;
3280 
3281 fail:
3282     migrate_set_state(&s->state, current_active_state,
3283                       MIGRATION_STATUS_FAILED);
3284 }
3285 
3286 bool migrate_colo_enabled(void)
3287 {
3288     MigrationState *s = migrate_get_current();
3289     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
3290 }
3291 
/* Outcome of an error check performed by the migration thread */
typedef enum MigThrError {
    MIG_THR_ERR_NONE = 0,       /* No error detected */
    MIG_THR_ERR_RECOVERED = 1,  /* Detected error, but resumed successfully */
    MIG_THR_ERR_FATAL = 2,      /* Detected fatal error, need to exit */
} MigThrError;
3300 
3301 static int postcopy_resume_handshake(MigrationState *s)
3302 {
3303     qemu_savevm_send_postcopy_resume(s->to_dst_file);
3304 
3305     while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3306         qemu_sem_wait(&s->rp_state.rp_sem);
3307     }
3308 
3309     if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3310         return 0;
3311     }
3312 
3313     return -1;
3314 }
3315 
3316 /* Return zero if success, or <0 for error */
3317 static int postcopy_do_resume(MigrationState *s)
3318 {
3319     int ret;
3320 
3321     /*
3322      * Call all the resume_prepare() hooks, so that modules can be
3323      * ready for the migration resume.
3324      */
3325     ret = qemu_savevm_state_resume_prepare(s);
3326     if (ret) {
3327         error_report("%s: resume_prepare() failure detected: %d",
3328                      __func__, ret);
3329         return ret;
3330     }
3331 
3332     /*
3333      * Last handshake with destination on the resume (destination will
3334      * switch to postcopy-active afterwards)
3335      */
3336     ret = postcopy_resume_handshake(s);
3337     if (ret) {
3338         error_report("%s: handshake failed: %d", __func__, ret);
3339         return ret;
3340     }
3341 
3342     return 0;
3343 }
3344 
3345 /*
3346  * We don't return until we are in a safe state to continue current
3347  * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
3348  * MIG_THR_ERR_FATAL if unrecovery failure happened.
3349  */
3350 static MigThrError postcopy_pause(MigrationState *s)
3351 {
3352     assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
3353 
3354     while (true) {
3355         QEMUFile *file;
3356 
3357         /*
3358          * Current channel is possibly broken. Release it.  Note that this is
3359          * guaranteed even without lock because to_dst_file should only be
3360          * modified by the migration thread.  That also guarantees that the
3361          * unregister of yank is safe too without the lock.  It should be safe
3362          * even to be within the qemu_file_lock, but we didn't do that to avoid
3363          * taking more mutex (yank_lock) within qemu_file_lock.  TL;DR: we make
3364          * the qemu_file_lock critical section as small as possible.
3365          */
3366         assert(s->to_dst_file);
3367         migration_ioc_unregister_yank_from_file(s->to_dst_file);
3368         qemu_mutex_lock(&s->qemu_file_lock);
3369         file = s->to_dst_file;
3370         s->to_dst_file = NULL;
3371         qemu_mutex_unlock(&s->qemu_file_lock);
3372 
3373         qemu_file_shutdown(file);
3374         qemu_fclose(file);
3375 
3376         migrate_set_state(&s->state, s->state,
3377                           MIGRATION_STATUS_POSTCOPY_PAUSED);
3378 
3379         error_report("Detected IO failure for postcopy. "
3380                      "Migration paused.");
3381 
3382         /*
3383          * We wait until things fixed up. Then someone will setup the
3384          * status back for us.
3385          */
3386         while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
3387             qemu_sem_wait(&s->postcopy_pause_sem);
3388         }
3389 
3390         if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3391             /* Woken up by a recover procedure. Give it a shot */
3392 
3393             /*
3394              * Firstly, let's wake up the return path now, with a new
3395              * return path channel.
3396              */
3397             qemu_sem_post(&s->postcopy_pause_rp_sem);
3398 
3399             /* Do the resume logic */
3400             if (postcopy_do_resume(s) == 0) {
3401                 /* Let's continue! */
3402                 trace_postcopy_pause_continued();
3403                 return MIG_THR_ERR_RECOVERED;
3404             } else {
3405                 /*
3406                  * Something wrong happened during the recovery, let's
3407                  * pause again. Pause is always better than throwing
3408                  * data away.
3409                  */
3410                 continue;
3411             }
3412         } else {
3413             /* This is not right... Time to quit. */
3414             return MIG_THR_ERR_FATAL;
3415         }
3416     }
3417 }
3418 
3419 static MigThrError migration_detect_error(MigrationState *s)
3420 {
3421     int ret;
3422     int state = s->state;
3423     Error *local_error = NULL;
3424 
3425     if (state == MIGRATION_STATUS_CANCELLING ||
3426         state == MIGRATION_STATUS_CANCELLED) {
3427         /* End the migration, but don't set the state to failed */
3428         return MIG_THR_ERR_FATAL;
3429     }
3430 
3431     /* Try to detect any file errors */
3432     ret = qemu_file_get_error_obj(s->to_dst_file, &local_error);
3433     if (!ret) {
3434         /* Everything is fine */
3435         assert(!local_error);
3436         return MIG_THR_ERR_NONE;
3437     }
3438 
3439     if (local_error) {
3440         migrate_set_error(s, local_error);
3441         error_free(local_error);
3442     }
3443 
3444     if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
3445         /*
3446          * For postcopy, we allow the network to be down for a
3447          * while. After that, it can be continued by a
3448          * recovery phase.
3449          */
3450         return postcopy_pause(s);
3451     } else {
3452         /*
3453          * For precopy (or postcopy with error outside IO), we fail
3454          * with no time.
3455          */
3456         migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
3457         trace_migration_thread_file_err();
3458 
3459         /* Time to stop the migration, now. */
3460         return MIG_THR_ERR_FATAL;
3461     }
3462 }
3463 
3464 /* How many bytes have we transferred since the beginning of the migration */
3465 static uint64_t migration_total_bytes(MigrationState *s)
3466 {
3467     return qemu_ftell(s->to_dst_file) + ram_counters.multifd_bytes;
3468 }
3469 
3470 static void migration_calculate_complete(MigrationState *s)
3471 {
3472     uint64_t bytes = migration_total_bytes(s);
3473     int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3474     int64_t transfer_time;
3475 
3476     s->total_time = end_time - s->start_time;
3477     if (!s->downtime) {
3478         /*
3479          * It's still not set, so we are precopy migration.  For
3480          * postcopy, downtime is calculated during postcopy_start().
3481          */
3482         s->downtime = end_time - s->downtime_start;
3483     }
3484 
3485     transfer_time = s->total_time - s->setup_time;
3486     if (transfer_time) {
3487         s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
3488     }
3489 }
3490 
3491 static void update_iteration_initial_status(MigrationState *s)
3492 {
3493     /*
3494      * Update these three fields at the same time to avoid mismatch info lead
3495      * wrong speed calculation.
3496      */
3497     s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3498     s->iteration_initial_bytes = migration_total_bytes(s);
3499     s->iteration_initial_pages = ram_get_total_transferred_pages();
3500 }
3501 
3502 static void migration_update_counters(MigrationState *s,
3503                                       int64_t current_time)
3504 {
3505     uint64_t transferred, transferred_pages, time_spent;
3506     uint64_t current_bytes; /* bytes transferred since the beginning */
3507     double bandwidth;
3508 
3509     if (current_time < s->iteration_start_time + BUFFER_DELAY) {
3510         return;
3511     }
3512 
3513     current_bytes = migration_total_bytes(s);
3514     transferred = current_bytes - s->iteration_initial_bytes;
3515     time_spent = current_time - s->iteration_start_time;
3516     bandwidth = (double)transferred / time_spent;
3517     s->threshold_size = bandwidth * s->parameters.downtime_limit;
3518 
3519     s->mbps = (((double) transferred * 8.0) /
3520                ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
3521 
3522     transferred_pages = ram_get_total_transferred_pages() -
3523                             s->iteration_initial_pages;
3524     s->pages_per_second = (double) transferred_pages /
3525                              (((double) time_spent / 1000.0));
3526 
3527     /*
3528      * if we haven't sent anything, we don't want to
3529      * recalculate. 10000 is a small enough number for our purposes
3530      */
3531     if (ram_counters.dirty_pages_rate && transferred > 10000) {
3532         s->expected_downtime = ram_counters.remaining / bandwidth;
3533     }
3534 
3535     qemu_file_reset_rate_limit(s->to_dst_file);
3536 
3537     update_iteration_initial_status(s);
3538 
3539     trace_migrate_transferred(transferred, time_spent,
3540                               bandwidth, s->threshold_size);
3541 }
3542 
/* What the migration thread should do after one iteration step */
typedef enum {
    /* Resume current iteration */
    MIG_ITERATE_RESUME = 0,
    /* Skip current iteration */
    MIG_ITERATE_SKIP = 1,
    /* Break the loop */
    MIG_ITERATE_BREAK = 2,
} MigIterateState;
3549 
3550 /*
3551  * Return true if continue to the next iteration directly, false
3552  * otherwise.
3553  */
3554 static MigIterateState migration_iteration_run(MigrationState *s)
3555 {
3556     uint64_t pending_size, pend_pre, pend_compat, pend_post;
3557     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
3558 
3559     qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
3560                               &pend_compat, &pend_post);
3561     pending_size = pend_pre + pend_compat + pend_post;
3562 
3563     trace_migrate_pending(pending_size, s->threshold_size,
3564                           pend_pre, pend_compat, pend_post);
3565 
3566     if (pending_size && pending_size >= s->threshold_size) {
3567         /* Still a significant amount to transfer */
3568         if (!in_postcopy && pend_pre <= s->threshold_size &&
3569             qatomic_read(&s->start_postcopy)) {
3570             if (postcopy_start(s)) {
3571                 error_report("%s: postcopy failed to start", __func__);
3572             }
3573             return MIG_ITERATE_SKIP;
3574         }
3575         /* Just another iteration step */
3576         qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
3577     } else {
3578         trace_migration_thread_low_pending(pending_size);
3579         migration_completion(s);
3580         return MIG_ITERATE_BREAK;
3581     }
3582 
3583     return MIG_ITERATE_RESUME;
3584 }
3585 
3586 static void migration_iteration_finish(MigrationState *s)
3587 {
3588     /* If we enabled cpu throttling for auto-converge, turn it off. */
3589     cpu_throttle_stop();
3590 
3591     qemu_mutex_lock_iothread();
3592     switch (s->state) {
3593     case MIGRATION_STATUS_COMPLETED:
3594         migration_calculate_complete(s);
3595         runstate_set(RUN_STATE_POSTMIGRATE);
3596         break;
3597 
3598     case MIGRATION_STATUS_ACTIVE:
3599         /*
3600          * We should really assert here, but since it's during
3601          * migration, let's try to reduce the usage of assertions.
3602          */
3603         if (!migrate_colo_enabled()) {
3604             error_report("%s: critical error: calling COLO code without "
3605                          "COLO enabled", __func__);
3606         }
3607         migrate_start_colo_process(s);
3608         /*
3609          * Fixme: we will run VM in COLO no matter its old running state.
3610          * After exited COLO, we will keep running.
3611          */
3612         s->vm_was_running = true;
3613         /* Fallthrough */
3614     case MIGRATION_STATUS_FAILED:
3615     case MIGRATION_STATUS_CANCELLED:
3616     case MIGRATION_STATUS_CANCELLING:
3617         if (s->vm_was_running) {
3618             vm_start();
3619         } else {
3620             if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
3621                 runstate_set(RUN_STATE_POSTMIGRATE);
3622             }
3623         }
3624         break;
3625 
3626     default:
3627         /* Should not reach here, but if so, forgive the VM. */
3628         error_report("%s: Unknown ending state %d", __func__, s->state);
3629         break;
3630     }
3631     migrate_fd_cleanup_schedule(s);
3632     qemu_mutex_unlock_iothread();
3633 }
3634 
3635 static void bg_migration_iteration_finish(MigrationState *s)
3636 {
3637     qemu_mutex_lock_iothread();
3638     switch (s->state) {
3639     case MIGRATION_STATUS_COMPLETED:
3640         migration_calculate_complete(s);
3641         break;
3642 
3643     case MIGRATION_STATUS_ACTIVE:
3644     case MIGRATION_STATUS_FAILED:
3645     case MIGRATION_STATUS_CANCELLED:
3646     case MIGRATION_STATUS_CANCELLING:
3647         break;
3648 
3649     default:
3650         /* Should not reach here, but if so, forgive the VM. */
3651         error_report("%s: Unknown ending state %d", __func__, s->state);
3652         break;
3653     }
3654 
3655     migrate_fd_cleanup_schedule(s);
3656     qemu_mutex_unlock_iothread();
3657 }
3658 
3659 /*
3660  * Return true if continue to the next iteration directly, false
3661  * otherwise.
3662  */
3663 static MigIterateState bg_migration_iteration_run(MigrationState *s)
3664 {
3665     int res;
3666 
3667     res = qemu_savevm_state_iterate(s->to_dst_file, false);
3668     if (res > 0) {
3669         bg_migration_completion(s);
3670         return MIG_ITERATE_BREAK;
3671     }
3672 
3673     return MIG_ITERATE_RESUME;
3674 }
3675 
3676 void migration_make_urgent_request(void)
3677 {
3678     qemu_sem_post(&migrate_get_current()->rate_limit_sem);
3679 }
3680 
3681 void migration_consume_urgent_request(void)
3682 {
3683     qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
3684 }
3685 
3686 /* Returns true if the rate limiting was broken by an urgent request */
3687 bool migration_rate_limit(void)
3688 {
3689     int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3690     MigrationState *s = migrate_get_current();
3691 
3692     bool urgent = false;
3693     migration_update_counters(s, now);
3694     if (qemu_file_rate_limit(s->to_dst_file)) {
3695 
3696         if (qemu_file_get_error(s->to_dst_file)) {
3697             return false;
3698         }
3699         /*
3700          * Wait for a delay to do rate limiting OR
3701          * something urgent to post the semaphore.
3702          */
3703         int ms = s->iteration_start_time + BUFFER_DELAY - now;
3704         trace_migration_rate_limit_pre(ms);
3705         if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
3706             /*
3707              * We were woken by one or more urgent things but
3708              * the timedwait will have consumed one of them.
3709              * The service routine for the urgent wake will dec
3710              * the semaphore itself for each item it consumes,
3711              * so add this one we just eat back.
3712              */
3713             qemu_sem_post(&s->rate_limit_sem);
3714             urgent = true;
3715         }
3716         trace_migration_rate_limit_post(urgent);
3717     }
3718     return urgent;
3719 }
3720 
3721 /*
3722  * if failover devices are present, wait they are completely
3723  * unplugged
3724  */
3725 
3726 static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
3727                                     int new_state)
3728 {
3729     if (qemu_savevm_state_guest_unplug_pending()) {
3730         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
3731 
3732         while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
3733                qemu_savevm_state_guest_unplug_pending()) {
3734             qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3735         }
3736         if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
3737             int timeout = 120; /* 30 seconds */
3738             /*
3739              * migration has been canceled
3740              * but as we have started an unplug we must wait the end
3741              * to be able to plug back the card
3742              */
3743             while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
3744                 qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3745             }
3746             if (qemu_savevm_state_guest_unplug_pending()) {
3747                 warn_report("migration: partially unplugged device on "
3748                             "failure");
3749             }
3750         }
3751 
3752         migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
3753     } else {
3754         migrate_set_state(&s->state, old_state, new_state);
3755     }
3756 }
3757 
3758 /*
3759  * Master migration thread on the source VM.
3760  * It drives the migration and pumps the data down the outgoing channel.
3761  */
3762 static void *migration_thread(void *opaque)
3763 {
3764     MigrationState *s = opaque;
3765     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3766     MigThrError thr_error;
3767     bool urgent = false;
3768 
3769     rcu_register_thread();
3770 
3771     object_ref(OBJECT(s));
3772     update_iteration_initial_status(s);
3773 
3774     qemu_savevm_state_header(s->to_dst_file);
3775 
3776     /*
3777      * If we opened the return path, we need to make sure dst has it
3778      * opened as well.
3779      */
3780     if (s->rp_state.rp_thread_created) {
3781         /* Now tell the dest that it should open its end so it can reply */
3782         qemu_savevm_send_open_return_path(s->to_dst_file);
3783 
3784         /* And do a ping that will make stuff easier to debug */
3785         qemu_savevm_send_ping(s->to_dst_file, 1);
3786     }
3787 
3788     if (migrate_postcopy()) {
3789         /*
3790          * Tell the destination that we *might* want to do postcopy later;
3791          * if the other end can't do postcopy it should fail now, nice and
3792          * early.
3793          */
3794         qemu_savevm_send_postcopy_advise(s->to_dst_file);
3795     }
3796 
3797     if (migrate_colo_enabled()) {
3798         /* Notify migration destination that we enable COLO */
3799         qemu_savevm_send_colo_enable(s->to_dst_file);
3800     }
3801 
3802     qemu_savevm_state_setup(s->to_dst_file);
3803 
3804     qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3805                                MIGRATION_STATUS_ACTIVE);
3806 
3807     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3808 
3809     trace_migration_thread_setup_complete();
3810 
3811     while (migration_is_active(s)) {
3812         if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
3813             MigIterateState iter_state = migration_iteration_run(s);
3814             if (iter_state == MIG_ITERATE_SKIP) {
3815                 continue;
3816             } else if (iter_state == MIG_ITERATE_BREAK) {
3817                 break;
3818             }
3819         }
3820 
3821         /*
3822          * Try to detect any kind of failures, and see whether we
3823          * should stop the migration now.
3824          */
3825         thr_error = migration_detect_error(s);
3826         if (thr_error == MIG_THR_ERR_FATAL) {
3827             /* Stop migration */
3828             break;
3829         } else if (thr_error == MIG_THR_ERR_RECOVERED) {
3830             /*
3831              * Just recovered from a e.g. network failure, reset all
3832              * the local variables. This is important to avoid
3833              * breaking transferred_bytes and bandwidth calculation
3834              */
3835             update_iteration_initial_status(s);
3836         }
3837 
3838         urgent = migration_rate_limit();
3839     }
3840 
3841     trace_migration_thread_after_loop();
3842     migration_iteration_finish(s);
3843     object_unref(OBJECT(s));
3844     rcu_unregister_thread();
3845     return NULL;
3846 }
3847 
3848 static void bg_migration_vm_start_bh(void *opaque)
3849 {
3850     MigrationState *s = opaque;
3851 
3852     qemu_bh_delete(s->vm_start_bh);
3853     s->vm_start_bh = NULL;
3854 
3855     vm_start();
3856     s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
3857 }
3858 
/**
 * Background snapshot thread, based on live migration code.
 * This is an alternative implementation of the live migration mechanism,
 * introduced specifically to support background snapshots.
 *
 * It takes advantage of the userfault_fd write protection mechanism
 * introduced in the v5.7 kernel. Compared to the existing dirty page
 * logging migration, much less stream traffic is produced, resulting in
 * smaller snapshot images, simply because no page duplicates can get into
 * the stream.
 *
 * Another key point is that the generated vmstate stream reflects the
 * machine state 'frozen' at the beginning of snapshot creation, whereas
 * with the dirty page logging mechanism the saved snapshot is effectively
 * the state of the VM at the end of the process.
 *
 * @opaque: the MigrationState of the snapshot being taken
 *
 * Always returns NULL.
 */
static void *bg_migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    int64_t setup_start;
    MigThrError thr_error;
    QEMUFile *fb;
    /*
     * True while a failure would jump to 'fail' with the iothread lock
     * still held; cleared once RAM write tracking has been started.
     */
    bool early_fail = true;

    rcu_register_thread();
    object_ref(OBJECT(s));

    /* INT64_MAX as the limit: effectively no rate limiting for snapshots */
    qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);

    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
    /*
     * We want to save vmstate for the moment when migration has been
     * initiated but also we want to save RAM content while VM is running.
     * The RAM content should appear first in the vmstate. So, we first
     * stash the non-RAM part of the vmstate to the temporary buffer,
     * then write RAM part of the vmstate to the migration stream
     * with vCPUs running and, finally, write stashed non-RAM part of
     * the vmstate from the buffer to the migration stream.
     */
    s->bioc = qio_channel_buffer_new(512 * 1024);
    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
    fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc));
    /*
     * NOTE(review): presumably fb now holds its own reference to the
     * channel, so the local one can be dropped - confirm against
     * qemu_fopen_channel_output().
     */
    object_unref(OBJECT(s->bioc));

    update_iteration_initial_status(s);

    /*
     * Prepare for tracking memory writes with UFFD-WP - populate
     * RAM pages before protecting.
     */
#ifdef __linux__
    ram_write_tracking_prepare();
#endif

    qemu_savevm_state_header(s->to_dst_file);
    qemu_savevm_state_setup(s->to_dst_file);

    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
                               MIGRATION_STATUS_ACTIVE);

    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;

    trace_migration_thread_setup_complete();
    /* Start of the stopped period; closed in bg_migration_vm_start_bh() */
    s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    qemu_mutex_lock_iothread();

    /*
     * If the VM is currently in suspended state, then, to make a valid
     * runstate transition in vm_stop_force_state(), we need to wake it up.
     */
    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
    s->vm_was_running = runstate_is_running();

    if (global_state_store()) {
        goto fail;
    }
    /* Forcibly stop VM before saving state of vCPUs and devices */
    if (vm_stop_force_state(RUN_STATE_PAUSED)) {
        goto fail;
    }
    /*
     * Put vCPUs in sync with shadow context structures, then
     * save their state to channel-buffer along with devices.
     */
    cpu_synchronize_all_states();
    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
        goto fail;
    }
    /*
     * Since we are going to get non-iterable state data directly
     * from s->bioc->data, explicit flush is needed here.
     */
    qemu_fflush(fb);

    /* Now initialize UFFD context and start tracking RAM writes */
    if (ram_write_tracking_start()) {
        goto fail;
    }
    /* From here on 'fail' is reached with the iothread lock released */
    early_fail = false;

    /*
     * Start VM from BH handler to avoid write-fault lock here.
     * UFFD-WP protection for the whole RAM is already enabled so
     * calling VM state change notifiers from vm_start() would initiate
     * writes to virtio VQs memory which is in write-protected region.
     */
    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
    qemu_bh_schedule(s->vm_start_bh);

    qemu_mutex_unlock_iothread();

    while (migration_is_active(s)) {
        MigIterateState iter_state = bg_migration_iteration_run(s);
        if (iter_state == MIG_ITERATE_SKIP) {
            continue;
        } else if (iter_state == MIG_ITERATE_BREAK) {
            break;
        }

        /*
         * Try to detect any kind of failures, and see whether we
         * should stop the migration now.
         */
        thr_error = migration_detect_error(s);
        if (thr_error == MIG_THR_ERR_FATAL) {
            /* Stop migration */
            break;
        }

        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
    }

    trace_migration_thread_after_loop();

fail:
    if (early_fail) {
        /*
         * Jumped here from the setup sequence above: mark the migration
         * as failed and release the iothread lock taken there.
         */
        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                MIGRATION_STATUS_FAILED);
        qemu_mutex_unlock_iothread();
    }

    bg_migration_iteration_finish(s);

    qemu_fclose(fb);
    object_unref(OBJECT(s));
    rcu_unregister_thread();

    return NULL;
}
4008 
/*
 * Continue an outgoing migration once the attempt to set up the channel
 * behind s->to_dst_file has finished.
 *
 * @s: the migration state
 * @error_in: NULL on success, otherwise the error that is handed to
 *            migrate_fd_error()
 *
 * For a fresh migration this creates the cleanup BH, notifies the
 * migration state notifiers, applies the bandwidth limit, optionally
 * opens the return path, sets up multifd and finally starts the
 * migration (or background snapshot) thread.  For a migration resumed
 * from MIGRATION_STATUS_POSTCOPY_PAUSED it only reconfigures the channel
 * and wakes up the already existing migration thread.
 */
void migrate_fd_connect(MigrationState *s, Error *error_in)
{
    Error *local_err = NULL;
    int64_t rate_limit;
    /* Sampled once up front; the state is changed further down */
    bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;

    /*
     * If there's a previous error, free it and prepare for another one.
     * Meanwhile, if the migration completes successfully, no error will
     * be dumped when calling migrate_fd_cleanup().
     */
    migrate_error_free(s);

    s->expected_downtime = s->parameters.downtime_limit;
    if (resume) {
        /* A resumed migration must still have its cleanup BH */
        assert(s->cleanup_bh);
    } else {
        assert(!s->cleanup_bh);
        s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
    }
    if (error_in) {
        migrate_fd_error(s, error_in);
        if (resume) {
            /*
             * Don't do cleanup for resume if channel is invalid, but only dump
             * the error.  We wait for another channel connect from the user.
             * The error_report still gives HMP user a hint on what failed.
             * It's normally done in migrate_fd_cleanup(), but call it here
             * explicitly.
             */
            error_report_err(error_copy(s->error));
        } else {
            migrate_fd_cleanup(s);
        }
        return;
    }

    if (resume) {
        /* This is a resumed migration */
        rate_limit = s->parameters.max_postcopy_bandwidth /
            XFER_LIMIT_RATIO;
    } else {
        /* This is a fresh new migration */
        rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;

        /* Notify before starting migration thread */
        notifier_list_notify(&migration_state_notifiers, s);
    }

    qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
    qemu_file_set_blocking(s->to_dst_file, true);

    /*
     * Open the return path. For postcopy, it is used exclusively. For
     * precopy, QEMU uses the return path only if the user specified the
     * "return-path" capability.
     */
    if (migrate_postcopy_ram() || migrate_use_return_path()) {
        if (open_return_path_on_source(s, !resume)) {
            error_report("Unable to open return-path for postcopy");
            migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
            migrate_fd_cleanup(s);
            return;
        }
    }

    if (resume) {
        /* Wakeup the main migration thread to do the recovery */
        migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);
        qemu_sem_post(&s->postcopy_pause_sem);
        return;
    }

    if (multifd_save_setup(&local_err) != 0) {
        error_report_err(local_err);
        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                          MIGRATION_STATUS_FAILED);
        migrate_fd_cleanup(s);
        return;
    }

    /* Pick the thread body that matches the kind of migration requested */
    if (migrate_background_snapshot()) {
        qemu_thread_create(&s->thread, "bg_snapshot",
                bg_migration_thread, s, QEMU_THREAD_JOINABLE);
    } else {
        qemu_thread_create(&s->thread, "live_migration",
                migration_thread, s, QEMU_THREAD_JOINABLE);
    }
    s->migration_thread_running = true;
}
4100 
4101 void migration_global_dump(Monitor *mon)
4102 {
4103     MigrationState *ms = migrate_get_current();
4104 
4105     monitor_printf(mon, "globals:\n");
4106     monitor_printf(mon, "store-global-state: %s\n",
4107                    ms->store_global_state ? "on" : "off");
4108     monitor_printf(mon, "only-migratable: %s\n",
4109                    only_migratable ? "on" : "off");
4110     monitor_printf(mon, "send-configuration: %s\n",
4111                    ms->send_configuration ? "on" : "off");
4112     monitor_printf(mon, "send-section-footer: %s\n",
4113                    ms->send_section_footer ? "on" : "off");
4114     monitor_printf(mon, "decompress-error-check: %s\n",
4115                    ms->decompress_error_check ? "on" : "off");
4116     monitor_printf(mon, "clear-bitmap-shift: %u\n",
4117                    ms->clear_bitmap_shift);
4118 }
4119 
/*
 * Define a boolean property called @name that is backed by entry @x of
 * MigrationState.enabled_capabilities[] and defaults to false.
 */
#define DEFINE_PROP_MIG_CAP(name, x)             \
    DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false)
4122 
/*
 * Properties of the migration object, each backed by a MigrationState
 * field and carrying its default value.  The table is installed on the
 * class by migration_class_init() through device_class_set_props().
 *
 * It has three groups: global switches, migration parameters (fields of
 * MigrationState.parameters) and migration capabilities (entries of
 * MigrationState.enabled_capabilities[], all off by default).
 */
static Property migration_properties[] = {
    /* Global switches */
    DEFINE_PROP_BOOL("store-global-state", MigrationState,
                     store_global_state, true),
    DEFINE_PROP_BOOL("send-configuration", MigrationState,
                     send_configuration, true),
    DEFINE_PROP_BOOL("send-section-footer", MigrationState,
                     send_section_footer, true),
    DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
                      decompress_error_check, true),
    DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
                      clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),

    /* Migration parameters */
    DEFINE_PROP_UINT8("x-compress-level", MigrationState,
                      parameters.compress_level,
                      DEFAULT_MIGRATE_COMPRESS_LEVEL),
    DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
                      parameters.compress_threads,
                      DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
    DEFINE_PROP_BOOL("x-compress-wait-thread", MigrationState,
                      parameters.compress_wait_thread, true),
    DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
                      parameters.decompress_threads,
                      DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
    DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
                      parameters.throttle_trigger_threshold,
                      DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD),
    DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
                      parameters.cpu_throttle_initial,
                      DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
    DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
                      parameters.cpu_throttle_increment,
                      DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
    DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
                      parameters.cpu_throttle_tailslow, false),
    DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
                      parameters.max_bandwidth, MAX_THROTTLE),
    DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
                      parameters.downtime_limit,
                      DEFAULT_MIGRATE_SET_DOWNTIME),
    DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
                      parameters.x_checkpoint_delay,
                      DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
    DEFINE_PROP_UINT8("multifd-channels", MigrationState,
                      parameters.multifd_channels,
                      DEFAULT_MIGRATE_MULTIFD_CHANNELS),
    DEFINE_PROP_MULTIFD_COMPRESSION("multifd-compression", MigrationState,
                      parameters.multifd_compression,
                      DEFAULT_MIGRATE_MULTIFD_COMPRESSION),
    DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
                      parameters.multifd_zlib_level,
                      DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
    DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
                      parameters.multifd_zstd_level,
                      DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
    DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
                      parameters.xbzrle_cache_size,
                      DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
    DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState,
                      parameters.max_postcopy_bandwidth,
                      DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH),
    DEFINE_PROP_UINT8("max-cpu-throttle", MigrationState,
                      parameters.max_cpu_throttle,
                      DEFAULT_MIGRATE_MAX_CPU_THROTTLE),
    DEFINE_PROP_SIZE("announce-initial", MigrationState,
                      parameters.announce_initial,
                      DEFAULT_MIGRATE_ANNOUNCE_INITIAL),
    DEFINE_PROP_SIZE("announce-max", MigrationState,
                      parameters.announce_max,
                      DEFAULT_MIGRATE_ANNOUNCE_MAX),
    DEFINE_PROP_SIZE("announce-rounds", MigrationState,
                      parameters.announce_rounds,
                      DEFAULT_MIGRATE_ANNOUNCE_ROUNDS),
    DEFINE_PROP_SIZE("announce-step", MigrationState,
                      parameters.announce_step,
                      DEFAULT_MIGRATE_ANNOUNCE_STEP),

    /* Migration capabilities */
    DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
    DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
    DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
    DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS),
    DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
    DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
    DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
    DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
    DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
    DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
    DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
    DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
    DEFINE_PROP_MIG_CAP("x-background-snapshot",
            MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),

    /* Terminator entry */
    DEFINE_PROP_END_OF_LIST(),
};
4218 
4219 static void migration_class_init(ObjectClass *klass, void *data)
4220 {
4221     DeviceClass *dc = DEVICE_CLASS(klass);
4222 
4223     dc->user_creatable = false;
4224     device_class_set_props(dc, migration_properties);
4225 }
4226 
4227 static void migration_instance_finalize(Object *obj)
4228 {
4229     MigrationState *ms = MIGRATION_OBJ(obj);
4230     MigrationParameters *params = &ms->parameters;
4231 
4232     qemu_mutex_destroy(&ms->error_mutex);
4233     qemu_mutex_destroy(&ms->qemu_file_lock);
4234     g_free(params->tls_hostname);
4235     g_free(params->tls_creds);
4236     qemu_sem_destroy(&ms->wait_unplug_sem);
4237     qemu_sem_destroy(&ms->rate_limit_sem);
4238     qemu_sem_destroy(&ms->pause_sem);
4239     qemu_sem_destroy(&ms->postcopy_pause_sem);
4240     qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
4241     qemu_sem_destroy(&ms->rp_state.rp_sem);
4242     error_free(ms->error);
4243 }
4244 
4245 static void migration_instance_init(Object *obj)
4246 {
4247     MigrationState *ms = MIGRATION_OBJ(obj);
4248     MigrationParameters *params = &ms->parameters;
4249 
4250     ms->state = MIGRATION_STATUS_NONE;
4251     ms->mbps = -1;
4252     ms->pages_per_second = -1;
4253     qemu_sem_init(&ms->pause_sem, 0);
4254     qemu_mutex_init(&ms->error_mutex);
4255 
4256     params->tls_hostname = g_strdup("");
4257     params->tls_creds = g_strdup("");
4258 
4259     /* Set has_* up only for parameter checks */
4260     params->has_compress_level = true;
4261     params->has_compress_threads = true;
4262     params->has_decompress_threads = true;
4263     params->has_throttle_trigger_threshold = true;
4264     params->has_cpu_throttle_initial = true;
4265     params->has_cpu_throttle_increment = true;
4266     params->has_cpu_throttle_tailslow = true;
4267     params->has_max_bandwidth = true;
4268     params->has_downtime_limit = true;
4269     params->has_x_checkpoint_delay = true;
4270     params->has_block_incremental = true;
4271     params->has_multifd_channels = true;
4272     params->has_multifd_compression = true;
4273     params->has_multifd_zlib_level = true;
4274     params->has_multifd_zstd_level = true;
4275     params->has_xbzrle_cache_size = true;
4276     params->has_max_postcopy_bandwidth = true;
4277     params->has_max_cpu_throttle = true;
4278     params->has_announce_initial = true;
4279     params->has_announce_max = true;
4280     params->has_announce_rounds = true;
4281     params->has_announce_step = true;
4282 
4283     qemu_sem_init(&ms->postcopy_pause_sem, 0);
4284     qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
4285     qemu_sem_init(&ms->rp_state.rp_sem, 0);
4286     qemu_sem_init(&ms->rate_limit_sem, 0);
4287     qemu_sem_init(&ms->wait_unplug_sem, 0);
4288     qemu_mutex_init(&ms->qemu_file_lock);
4289 }
4290 
4291 /*
4292  * Return true if check pass, false otherwise. Error will be put
4293  * inside errp if provided.
4294  */
4295 static bool migration_object_check(MigrationState *ms, Error **errp)
4296 {
4297     MigrationCapabilityStatusList *head = NULL;
4298     /* Assuming all off */
4299     bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
4300     int i;
4301 
4302     if (!migrate_params_check(&ms->parameters, errp)) {
4303         return false;
4304     }
4305 
4306     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
4307         if (ms->enabled_capabilities[i]) {
4308             QAPI_LIST_PREPEND(head, migrate_cap_add(i, true));
4309         }
4310     }
4311 
4312     ret = migrate_caps_check(cap_list, head, errp);
4313 
4314     /* It works with head == NULL */
4315     qapi_free_MigrationCapabilityStatusList(head);
4316 
4317     return ret;
4318 }
4319 
4320 static const TypeInfo migration_type = {
4321     .name = TYPE_MIGRATION,
4322     /*
4323      * NOTE: TYPE_MIGRATION is not really a device, as the object is
4324      * not created using qdev_new(), it is not attached to the qdev
4325      * device tree, and it is never realized.
4326      *
4327      * TODO: Make this TYPE_OBJECT once QOM provides something like
4328      * TYPE_DEVICE's "-global" properties.
4329      */
4330     .parent = TYPE_DEVICE,
4331     .class_init = migration_class_init,
4332     .class_size = sizeof(MigrationClass),
4333     .instance_size = sizeof(MigrationState),
4334     .instance_init = migration_instance_init,
4335     .instance_finalize = migration_instance_finalize,
4336 };
4337 
/* Register the migration type described by migration_type with QOM. */
static void register_migration_types(void)
{
    type_register_static(&migration_type);
}

/*
 * NOTE(review): type_init() is expected to arrange for
 * register_migration_types() to run during QEMU's type registration
 * phase - confirm against qemu/module.h.
 */
type_init(register_migration_types);
4344