/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "migration/blocker.h"
#include "exec.h"
#include "fd.h"
#include "socket.h"
#include "rdma.h"
#include "ram.h"
#include "migration/global_state.h"
#include "migration/misc.h"
#include "migration.h"
#include "savevm.h"
#include "qemu-file-channel.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qnull.h"
#include "qemu/rcu.h"
#include "block.h"
#include "postcopy-ram.h"
#include "qemu/thread.h"
#include "trace.h"
#include "exec/target_page.h"
#include "io/channel-buffer.h"
#include "migration/colo.h"
#include "hw/boards.h"
#include "monitor/monitor.h"

#define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */

/* Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data. */
#define BUFFER_DELAY     100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
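
/*
 * Worked example (illustrative, not part of the original source):
 * with BUFFER_DELAY = 100 ms, XFER_LIMIT_RATIO = 1000 / 100 = 10, so
 * the rate limiter grants each buffering window max_bandwidth / 10
 * bytes.  At the default MAX_THROTTLE of 32 MiB/s that is about
 * 3.2 MiB of data per 100 ms chunk.
 */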

/* Time in milliseconds we are allowed to stop the source,
 * for sending the last part */
#define DEFAULT_MIGRATE_SET_DOWNTIME 300

/* Maximum migrate downtime set to 2000 seconds */
#define MAX_MIGRATE_DOWNTIME_SECONDS 2000
#define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000)

/* Default compression thread count */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/* Default decompression thread count; decompression is usually at
 * least 4 times as fast as compression. */
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* 0: no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
/* Default auto-converge CPU throttle migration parameters */
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10

/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)

/* The delay time (in ms) between two COLO checkpoints
 * Note: Please change this default value to 10000 when we support hybrid mode.
 */
#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY 200
#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 16

static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

static bool deferred_incoming;

/* Messages sent on the return path from destination to source */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32) */

    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */

    MIG_RP_MSG_MAX
};
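
/*
 * Framing sketch for a return-path message, as encoded by
 * migrate_send_rp_message() below (derived from its qemu_put_* calls;
 * shown here only as an illustration):
 *
 *   be16  message_type   (one of mig_rp_message_type)
 *   be16  len            (payload length in bytes)
 *   len bytes of payload (layout depends on the message type)
 */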

/* When we add fault tolerance, we could have several migrations at
 * once.  For now we don't need dynamic creation of migrations. */

static MigrationState *current_migration;
static MigrationIncomingState *current_incoming;

static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state);

void migration_object_init(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    Error *err = NULL;

    /* This can only be called once. */
    assert(!current_migration);
    current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));

    /*
     * Initialize the incoming migration object as well, whether or
     * not we will end up using it.
     */
    assert(!current_incoming);
    current_incoming = g_new0(MigrationIncomingState, 1);
    current_incoming->state = MIGRATION_STATUS_NONE;
    current_incoming->postcopy_remote_fds =
        g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
    qemu_mutex_init(&current_incoming->rp_mutex);
    qemu_event_init(&current_incoming->main_thread_load_event, false);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);

    init_dirty_bitmap_incoming_migration();

    if (!migration_object_check(current_migration, &err)) {
        error_report_err(err);
        exit(1);
    }

    /*
     * We cannot really do this in migration_instance_init() since at
     * that time global properties are not yet applied, so any value
     * set there would definitely be overwritten by something else.
     */
    if (ms->enforce_config_section) {
        current_migration->send_configuration = true;
    }
}

void migration_object_finalize(void)
{
    object_unref(OBJECT(current_migration));
}

/* For outgoing */
MigrationState *migrate_get_current(void)
{
    /* This can only be called after the object has been created. */
    assert(current_migration);
    return current_migration;
}

MigrationIncomingState *migration_incoming_get_current(void)
{
    assert(current_incoming);
    return current_incoming;
}

void migration_incoming_state_destroy(void)
{
    struct MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->to_src_file) {
        /* Tell source that we are done */
        migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
        qemu_fclose(mis->to_src_file);
        mis->to_src_file = NULL;
    }

    if (mis->from_src_file) {
        qemu_fclose(mis->from_src_file);
        mis->from_src_file = NULL;
    }
    if (mis->postcopy_remote_fds) {
        g_array_free(mis->postcopy_remote_fds, TRUE);
        mis->postcopy_remote_fds = NULL;
    }

    qemu_event_reset(&mis->main_thread_load_event);
}

static void migrate_generate_event(int new_state)
{
    if (migrate_use_events()) {
        qapi_event_send_migration(new_state, &error_abort);
    }
}

static bool migrate_late_block_activate(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[
        MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE];
}

/*
 * Called on -incoming with a defer: uri.
 * The migration can be started later after any parameters have been
 * changed.
 */
static void deferred_incoming_migration(Error **errp)
{
    if (deferred_incoming) {
        error_setg(errp, "Incoming migration already deferred");
    }
    deferred_incoming = true;
}

/*
 * Send a message on the return channel back to the source
 * of the migration.
 */
static int migrate_send_rp_message(MigrationIncomingState *mis,
                                   enum mig_rp_message_type message_type,
                                   uint16_t len, void *data)
{
    int ret = 0;

    trace_migrate_send_rp_message((int)message_type, len);
    qemu_mutex_lock(&mis->rp_mutex);

    /*
     * It's possible that the file handle got lost due to network
     * failures.
     */
    if (!mis->to_src_file) {
        ret = -EIO;
        goto error;
    }

    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
    qemu_put_be16(mis->to_src_file, len);
    qemu_put_buffer(mis->to_src_file, data, len);
    qemu_fflush(mis->to_src_file);

    /* It's possible that the QEMUFile hit an error during the send */
    ret = qemu_file_get_error(mis->to_src_file);

error:
    qemu_mutex_unlock(&mis->rp_mutex);
    return ret;
}

/* Request a range of pages from the source VM at the given
 * start address.
 *   rbname: Name of the RAMBlock to request the pages in; if NULL it's the
 *           same as the last request (a name must have been given previously)
 *   start:  Address offset within the RB
 *   len:    Length in bytes required - must be a multiple of pagesize
 */
int migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
                              ram_addr_t start, size_t len)
{
    uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname len (1) + up to 255 name bytes */
    size_t msglen = 12; /* start + len */
    enum mig_rp_message_type msg_type;

    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);

    if (rbname) {
        int rbname_len = strlen(rbname);
        assert(rbname_len < 256);

        bufc[msglen++] = rbname_len;
        memcpy(bufc + msglen, rbname, rbname_len);
        msglen += rbname_len;
        msg_type = MIG_RP_MSG_REQ_PAGES_ID;
    } else {
        msg_type = MIG_RP_MSG_REQ_PAGES;
    }

    return migrate_send_rp_message(mis, msg_type, msglen, bufc);
}
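
/*
 * Payload layout for MIG_RP_MSG_REQ_PAGES_ID as built above (a sketch
 * of this file's own encoding, for illustration only):
 *
 *   bytes  0..7   start         (be64)
 *   bytes  8..11  len           (be32)
 *   byte  12      rbname length (one byte, < 256)
 *   bytes 13..    rbname        (not NUL-terminated)
 *
 * MIG_RP_MSG_REQ_PAGES uses only the first 12 bytes.
 */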

void qemu_start_incoming_migration(const char *uri, Error **errp)
{
    const char *p;

    qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort);
    if (!strcmp(uri, "defer")) {
        deferred_incoming_migration(errp);
    } else if (strstart(uri, "tcp:", &p)) {
        tcp_start_incoming_migration(p, errp);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_incoming_migration(p, errp);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "unix:", &p)) {
        unix_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}
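
/*
 * Example URIs accepted by the dispatch above (illustrative values;
 * host names, paths and fd numbers are placeholders):
 *
 *   tcp:0.0.0.0:4444        listen on a TCP port
 *   unix:/tmp/migrate.sock  listen on a UNIX domain socket
 *   exec:cat dump.mig       read the stream from a command's stdout
 *   fd:42                   use an already-open file descriptor
 *   rdma:host:4444          RDMA transport (when CONFIG_RDMA is set)
 *   defer                   wait for a later 'migrate-incoming' command
 */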

static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    /* If capability late_block_activate is set:
     * Only fire up the block code now if we're going to restart the
     * VM, else 'cont' will do it.
     * This causes file locking to happen; so we don't want it to happen
     * unless we really are starting the VM.
     */
    if (!migrate_late_block_activate() ||
         (autostart && (!global_state_received() ||
            global_state_get_runstate() == RUN_STATE_RUNNING))) {
        /* Make sure all file formats flush their mutable metadata.
         * If we get an error here, just don't restart the VM yet. */
        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            autostart = false;
        }
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self();

    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
        autostart = false;
    }
    /* If the global state section was not received or we are in
     * running state, we need to obey autostart.  Any other state is
     * set with runstate_set. */

    dirty_bitmap_mig_before_vm_start();

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else {
        runstate_set(global_state_get_runstate());
    }
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
}

static void process_incoming_migration_co(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyState ps;
    int ret;

    assert(mis->from_src_file);
    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_ACTIVE);
    ret = qemu_loadvm_state(mis->from_src_file);

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else if something went wrong then just fall out of the normal exit */
    }

    /* We have the COLO info now, so we know whether we are in COLO mode */
    if (!ret && migration_incoming_enable_colo()) {
        mis->migration_incoming_co = qemu_coroutine_self();
        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
        mis->have_colo_incoming_thread = true;
        qemu_coroutine_yield();

        /* Wait for the checkpoint incoming thread to exit before freeing
         * resources */
        qemu_thread_join(&mis->colo_incoming_thread);
    }

    if (ret < 0) {
        Error *local_err = NULL;

        migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_FAILED);
        error_report("load of migration failed: %s", strerror(-ret));
        qemu_fclose(mis->from_src_file);
        if (multifd_load_cleanup(&local_err) != 0) {
            error_report_err(local_err);
        }
        exit(EXIT_FAILURE);
    }
    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    qemu_bh_schedule(mis->bh);
}

static void migration_incoming_setup(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (multifd_load_setup() != 0) {
        /* We haven't been able to create the multifd threads;
         * there is nothing better we can do. */
        exit(EXIT_FAILURE);
    }

    if (!mis->from_src_file) {
        mis->from_src_file = f;
    }
    qemu_file_set_blocking(f, false);
}

void migration_incoming_process(void)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
    qemu_coroutine_enter(co);
}

void migration_fd_process_incoming(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
        /* Resumed from a paused postcopy migration */

        mis->from_src_file = f;
        /* Postcopy has a standalone thread to do the vm load */
        qemu_file_set_blocking(f, true);

        /* Re-configure the return path */
        mis->to_src_file = qemu_file_get_return_path(f);

        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);

        /*
         * Here, we only wake up the main loading thread (while the
         * fault thread is still waiting), so that we can receive
         * commands from the source now, and answer them if needed.
         * The fault thread will be woken up later, once we are sure
         * that the source is ready to reply to page requests.
         */
        qemu_sem_post(&mis->postcopy_pause_sem_dst);
    } else {
        /* New incoming migration */
        migration_incoming_setup(f);
        migration_incoming_process();
    }
}

void migration_ioc_process_incoming(QIOChannel *ioc)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!mis->from_src_file) {
        QEMUFile *f = qemu_fopen_channel_input(ioc);
        migration_incoming_setup(f);
        return;
    }
    multifd_recv_new_channel(ioc);
}

/**
 * @migration_has_all_channels: We have received all channels that we need
 *
 * Returns true when we have connections to all the channels that
 * we need for migration.
 */
bool migration_has_all_channels(void)
{
    bool all_channels;

    all_channels = multifd_recv_all_channels_created();

    return all_channels;
}

/*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP.  A non-0 value indicates
 * an error.
 */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
}

/*
 * Send a 'PONG' message on the return channel with the given value
 * (normally in response to a 'PING')
 */
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}

void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name)
{
    char buf[512];
    int len;
    int64_t res;

    /*
     * First, we send the header part. It contains only the len of
     * idstr, and the idstr itself.
     */
    len = strlen(block_name);
    buf[0] = len;
    memcpy(buf + 1, block_name, len);

    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
                     __func__);
        return;
    }

    migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);

    /*
     * Next, we dump the received bitmap to the stream.
     *
     * TODO: currently we are safe since we are the only one that is
     * using the to_src_file handle (the fault thread is still paused),
     * so it would be OK even without taking the mutex.  Still, the
     * best approach would be to take the lock before sending the
     * message header and release it after sending the bitmap.
     */
    qemu_mutex_lock(&mis->rp_mutex);
    res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
    qemu_mutex_unlock(&mis->rp_mutex);

    trace_migrate_send_rp_recv_bitmap(block_name, res);
}

void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
}

MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
{
    MigrationCapabilityStatusList *head = NULL;
    MigrationCapabilityStatusList *caps;
    MigrationState *s = migrate_get_current();
    int i;

    caps = NULL; /* silence compiler warning */
    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
#ifndef CONFIG_LIVE_BLOCK_MIGRATION
        if (i == MIGRATION_CAPABILITY_BLOCK) {
            continue;
        }
#endif
        if (head == NULL) {
            head = g_malloc0(sizeof(*caps));
            caps = head;
        } else {
            caps->next = g_malloc0(sizeof(*caps));
            caps = caps->next;
        }
        caps->value =
            g_malloc(sizeof(*caps->value));
        caps->value->capability = i;
        caps->value->state = s->enabled_capabilities[i];
    }

    return head;
}

MigrationParameters *qmp_query_migrate_parameters(Error **errp)
{
    MigrationParameters *params;
    MigrationState *s = migrate_get_current();

    /* TODO use QAPI_CLONE() instead of duplicating it inline */
    params = g_malloc0(sizeof(*params));
    params->has_compress_level = true;
    params->compress_level = s->parameters.compress_level;
    params->has_compress_threads = true;
    params->compress_threads = s->parameters.compress_threads;
    params->has_decompress_threads = true;
    params->decompress_threads = s->parameters.decompress_threads;
    params->has_cpu_throttle_initial = true;
    params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
    params->has_cpu_throttle_increment = true;
    params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
    params->has_tls_creds = true;
    params->tls_creds = g_strdup(s->parameters.tls_creds);
    params->has_tls_hostname = true;
    params->tls_hostname = g_strdup(s->parameters.tls_hostname);
    params->has_max_bandwidth = true;
    params->max_bandwidth = s->parameters.max_bandwidth;
    params->has_downtime_limit = true;
    params->downtime_limit = s->parameters.downtime_limit;
    params->has_x_checkpoint_delay = true;
    params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
    params->has_block_incremental = true;
    params->block_incremental = s->parameters.block_incremental;
    params->has_x_multifd_channels = true;
    params->x_multifd_channels = s->parameters.x_multifd_channels;
    params->has_x_multifd_page_count = true;
    params->x_multifd_page_count = s->parameters.x_multifd_page_count;
    params->has_xbzrle_cache_size = true;
    params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;

    return params;
}

/*
 * Return true if we're already in the middle of a migration
 * (i.e. any of the active or setup states)
 */
static bool migration_is_setup_or_active(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
        return true;

    default:
        return false;
    }
}

static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
    info->has_ram = true;
    info->ram = g_malloc0(sizeof(*info->ram));
    info->ram->transferred = ram_counters.transferred;
    info->ram->total = ram_bytes_total();
    info->ram->duplicate = ram_counters.duplicate;
    /* legacy value.  It is not used anymore */
    info->ram->skipped = 0;
    info->ram->normal = ram_counters.normal;
    info->ram->normal_bytes = ram_counters.normal *
        qemu_target_page_size();
    info->ram->mbps = s->mbps;
    info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
    info->ram->postcopy_requests = ram_counters.postcopy_requests;
    info->ram->page_size = qemu_target_page_size();

    if (migrate_use_xbzrle()) {
        info->has_xbzrle_cache = true;
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_counters.bytes;
        info->xbzrle_cache->pages = xbzrle_counters.pages;
        info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
        info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
        info->xbzrle_cache->overflow = xbzrle_counters.overflow;
    }

    if (cpu_throttle_active()) {
        info->has_cpu_throttle_percentage = true;
        info->cpu_throttle_percentage = cpu_throttle_get_percentage();
    }

    if (s->state != MIGRATION_STATUS_COMPLETED) {
        info->ram->remaining = ram_bytes_remaining();
        info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate;
    }
}

static void populate_disk_info(MigrationInfo *info)
{
    if (blk_mig_active()) {
        info->has_disk = true;
        info->disk = g_malloc0(sizeof(*info->disk));
        info->disk->transferred = blk_mig_bytes_transferred();
        info->disk->remaining = blk_mig_bytes_remaining();
        info->disk->total = blk_mig_bytes_total();
    }
}

static void fill_source_migration_info(MigrationInfo *info)
{
    MigrationState *s = migrate_get_current();

    switch (s->state) {
    case MIGRATION_STATUS_NONE:
        /* No migration has ever happened; do not overwrite the
         * destination migration status. */
        return;
    case MIGRATION_STATUS_SETUP:
        info->has_status = true;
        info->has_total_time = false;
        break;
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
        /* TODO add some postcopy stats */
        info->has_status = true;
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
            - s->start_time;
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        populate_ram_info(info, s);
        populate_disk_info(info);
        break;
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        /* TODO: display COLO specific information (checkpoint info etc.) */
        break;
    case MIGRATION_STATUS_COMPLETED:
        info->has_status = true;
        info->has_total_time = true;
        info->total_time = s->total_time;
        info->has_downtime = true;
        info->downtime = s->downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        populate_ram_info(info, s);
        break;
    case MIGRATION_STATUS_FAILED:
        info->has_status = true;
        if (s->error) {
            info->has_error_desc = true;
            info->error_desc = g_strdup(error_get_pretty(s->error));
        }
        break;
    case MIGRATION_STATUS_CANCELLED:
        info->has_status = true;
        break;
    }
    info->status = s->state;
}

/**
 * @migrate_caps_check - check capability validity
 *
 * @cap_list: old capability list, array of bool
 * @params: new capabilities to be applied soon
 * @errp: set *errp if the check failed, with reason
 *
 * Returns true if check passed, otherwise false.
 */
static bool migrate_caps_check(bool *cap_list,
                               MigrationCapabilityStatusList *params,
                               Error **errp)
{
    MigrationCapabilityStatusList *cap;
    bool old_postcopy_cap;
    MigrationIncomingState *mis = migration_incoming_get_current();

    old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM];

    for (cap = params; cap; cap = cap->next) {
        cap_list[cap->value->capability] = cap->value->state;
    }

#ifndef CONFIG_LIVE_BLOCK_MIGRATION
    if (cap_list[MIGRATION_CAPABILITY_BLOCK]) {
        error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) "
                   "block migration");
        error_append_hint(errp, "Use drive_mirror+NBD instead.\n");
        return false;
    }
#endif

    if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
        if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
            /* The decompression threads asynchronously write into RAM
             * rather than use the atomic copies needed to avoid
             * userfaulting.  It should be possible to fix the decompression
             * threads for compatibility in future.
             */
            error_setg(errp, "Postcopy is not currently compatible "
                       "with compression");
            return false;
        }

        /* This check is reasonably expensive, so only perform it when
         * the capability is being set for the first time; also, only
         * the destination needs the special support.
         */
        if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
            !postcopy_ram_supported_by_host(mis)) {
            /* postcopy_ram_supported_by_host will have emitted a more
             * detailed message
             */
            error_setg(errp, "Postcopy is not supported");
            return false;
        }
    }

    return true;
}

static void fill_destination_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    switch (mis->state) {
    case MIGRATION_STATUS_NONE:
        return;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_COMPLETED:
        info->has_status = true;
        fill_destination_postcopy_migration_info(info);
        break;
    }
    info->status = mis->state;
}

MigrationInfo *qmp_query_migrate(Error **errp)
{
    MigrationInfo *info = g_malloc0(sizeof(*info));

    fill_destination_migration_info(info);
    fill_source_migration_info(info);

    return info;
}

void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
                                  Error **errp)
{
    MigrationState *s = migrate_get_current();
    MigrationCapabilityStatusList *cap;
    bool cap_list[MIGRATION_CAPABILITY__MAX];

    if (migration_is_setup_or_active(s->state)) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return;
    }

    memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
    if (!migrate_caps_check(cap_list, params, errp)) {
        return;
    }

    for (cap = params; cap; cap = cap->next) {
        s->enabled_capabilities[cap->value->capability] = cap->value->state;
    }
}
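
/*
 * Illustrative QMP usage of the command above (the capability name is
 * just an example; any MigrationCapability member works the same way):
 *
 *   { "execute": "migrate-set-capabilities",
 *     "arguments": { "capabilities": [
 *         { "capability": "xbzrle", "state": true } ] } }
 */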

/*
 * Check whether the parameters are valid.  On failure, an error is
 * put into errp (if provided).  Return true if valid, otherwise false.
 */
static bool migrate_params_check(MigrationParameters *params, Error **errp)
{
    if (params->has_compress_level &&
        (params->compress_level > 9)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
                   "is invalid, it should be in the range of 0 to 9");
        return false;
    }

    if (params->has_compress_threads && (params->compress_threads < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "compress_threads",
                   "is invalid, it should be in the range of 1 to 255");
        return false;
    }

    if (params->has_decompress_threads && (params->decompress_threads < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "decompress_threads",
                   "is invalid, it should be in the range of 1 to 255");
        return false;
    }

    if (params->has_cpu_throttle_initial &&
        (params->cpu_throttle_initial < 1 ||
         params->cpu_throttle_initial > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_initial",
                   "an integer in the range of 1 to 99");
        return false;
    }

    if (params->has_cpu_throttle_increment &&
        (params->cpu_throttle_increment < 1 ||
         params->cpu_throttle_increment > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_increment",
                   "an integer in the range of 1 to 99");
        return false;
    }

    if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
        error_setg(errp, "Parameter 'max_bandwidth' expects an integer in the"
                         " range of 0 to %zu bytes/second", SIZE_MAX);
        return false;
    }

    if (params->has_downtime_limit &&
        (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
        error_setg(errp, "Parameter 'downtime_limit' expects an integer in "
                         "the range of 0 to %d milliseconds",
                         MAX_MIGRATE_DOWNTIME);
        return false;
    }

    /* x_checkpoint_delay is now always positive */

    if (params->has_x_multifd_channels && (params->x_multifd_channels < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "multifd_channels",
                   "is invalid, it should be in the range of 1 to 255");
        return false;
    }
    if (params->has_x_multifd_page_count &&
        (params->x_multifd_page_count < 1 ||
         params->x_multifd_page_count > 10000)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "multifd_page_count",
                   "is invalid, it should be in the range of 1 to 10000");
        return false;
    }

    if (params->has_xbzrle_cache_size &&
        (params->xbzrle_cache_size < qemu_target_page_size() ||
         !is_power_of_2(params->xbzrle_cache_size))) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "xbzrle_cache_size",
                   "is invalid, it should be bigger than target page size"
                   " and a power of two");
        return false;
    }

    return true;
}

static void migrate_params_test_apply(MigrateSetParameters *params,
                                      MigrationParameters *dest)
{
    *dest = migrate_get_current()->parameters;

    /* TODO use QAPI_CLONE() instead of duplicating it inline */

    if (params->has_compress_level) {
        dest->compress_level = params->compress_level;
    }

    if (params->has_compress_threads) {
        dest->compress_threads = params->compress_threads;
    }

    if (params->has_decompress_threads) {
        dest->decompress_threads = params->decompress_threads;
    }

    if (params->has_cpu_throttle_initial) {
        dest->cpu_throttle_initial = params->cpu_throttle_initial;
    }

    if (params->has_cpu_throttle_increment) {
        dest->cpu_throttle_increment = params->cpu_throttle_increment;
    }

    if (params->has_tls_creds) {
        assert(params->tls_creds->type == QTYPE_QSTRING);
        dest->tls_creds = g_strdup(params->tls_creds->u.s);
    }

    if (params->has_tls_hostname) {
        assert(params->tls_hostname->type == QTYPE_QSTRING);
        dest->tls_hostname = g_strdup(params->tls_hostname->u.s);
    }

    if (params->has_max_bandwidth) {
        dest->max_bandwidth = params->max_bandwidth;
    }

    if (params->has_downtime_limit) {
        dest->downtime_limit = params->downtime_limit;
    }

    if (params->has_x_checkpoint_delay) {
        dest->x_checkpoint_delay = params->x_checkpoint_delay;
    }

    if (params->has_block_incremental) {
        dest->block_incremental = params->block_incremental;
    }
    if (params->has_x_multifd_channels) {
        dest->x_multifd_channels = params->x_multifd_channels;
    }
    if (params->has_x_multifd_page_count) {
        dest->x_multifd_page_count = params->x_multifd_page_count;
    }
    if (params->has_xbzrle_cache_size) {
        dest->xbzrle_cache_size = params->xbzrle_cache_size;
    }
}

static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
{
    MigrationState *s = migrate_get_current();

    /* TODO use QAPI_CLONE() instead of duplicating it inline */

    if (params->has_compress_level) {
        s->parameters.compress_level = params->compress_level;
    }

    if (params->has_compress_threads) {
        s->parameters.compress_threads = params->compress_threads;
    }

    if (params->has_decompress_threads) {
        s->parameters.decompress_threads = params->decompress_threads;
    }

    if (params->has_cpu_throttle_initial) {
        s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
    }

    if (params->has_cpu_throttle_increment) {
        s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
    }

    if (params->has_tls_creds) {
        g_free(s->parameters.tls_creds);
        assert(params->tls_creds->type == QTYPE_QSTRING);
        s->parameters.tls_creds = g_strdup(params->tls_creds->u.s);
    }

    if (params->has_tls_hostname) {
        g_free(s->parameters.tls_hostname);
        assert(params->tls_hostname->type == QTYPE_QSTRING);
        s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s);
    }

    if (params->has_max_bandwidth) {
        s->parameters.max_bandwidth = params->max_bandwidth;
        if (s->to_dst_file) {
            qemu_file_set_rate_limit(s->to_dst_file,
                                s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
        }
    }

    if (params->has_downtime_limit) {
        s->parameters.downtime_limit = params->downtime_limit;
    }

    if (params->has_x_checkpoint_delay) {
        s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
        if (migration_in_colo_state()) {
            colo_checkpoint_notify(s);
        }
    }

    if (params->has_block_incremental) {
        s->parameters.block_incremental = params->block_incremental;
    }
    if (params->has_x_multifd_channels) {
        s->parameters.x_multifd_channels = params->x_multifd_channels;
    }
    if (params->has_x_multifd_page_count) {
        s->parameters.x_multifd_page_count = params->x_multifd_page_count;
    }
    if (params->has_xbzrle_cache_size) {
        s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
        xbzrle_cache_resize(params->xbzrle_cache_size, errp);
    }
}

void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
{
    MigrationParameters tmp;

    /* TODO Rewrite "" to null instead */
    if (params->has_tls_creds
        && params->tls_creds->type == QTYPE_QNULL) {
        qobject_unref(params->tls_creds->u.n);
        params->tls_creds->type = QTYPE_QSTRING;
        params->tls_creds->u.s = g_strdup("");
    }
    /* TODO Rewrite "" to null instead */
    if (params->has_tls_hostname
        && params->tls_hostname->type == QTYPE_QNULL) {
        qobject_unref(params->tls_hostname->u.n);
        params->tls_hostname->type = QTYPE_QSTRING;
        params->tls_hostname->u.s = g_strdup("");
    }

    migrate_params_test_apply(params, &tmp);

    if (!migrate_params_check(&tmp, errp)) {
        /* Invalid parameter */
        return;
    }

    migrate_params_apply(params, errp);
}
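
/*
 * Illustrative QMP usage (the parameter choice is an example; any
 * member of MigrationParameters can be set the same way):
 *
 *   { "execute": "migrate-set-parameters",
 *     "arguments": { "max-bandwidth": 33554432 } }
 */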

void qmp_migrate_start_postcopy(Error **errp)
{
    MigrationState *s = migrate_get_current();

    if (!migrate_postcopy()) {
        error_setg(errp, "Enable postcopy with migrate_set_capability before"
                         " the start of migration");
        return;
    }

    if (s->state == MIGRATION_STATUS_NONE) {
        error_setg(errp, "Postcopy must be started after migration has been"
                         " started");
        return;
    }
    /*
     * We don't error if migration has finished since that would be racy
     * with issuing this command.
     */
    atomic_set(&s->start_postcopy, true);
}

/* shared migration helpers */

void migrate_set_state(int *state, int old_state, int new_state)
{
    assert(new_state < MIGRATION_STATUS__MAX);
    if (atomic_cmpxchg(state, old_state, new_state) == old_state) {
        trace_migrate_set_state(MigrationStatus_str(new_state));
        migrate_generate_event(new_state);
    }
}

static MigrationCapabilityStatusList *migrate_cap_add(
    MigrationCapabilityStatusList *list,
    MigrationCapability index,
    bool state)
{
    MigrationCapabilityStatusList *cap;

    cap = g_new0(MigrationCapabilityStatusList, 1);
    cap->value = g_new0(MigrationCapabilityStatus, 1);
    cap->value->capability = index;
    cap->value->state = state;
    cap->next = list;

    return cap;
}

void migrate_set_block_enabled(bool value, Error **errp)
{
    MigrationCapabilityStatusList *cap;

    cap = migrate_cap_add(NULL, MIGRATION_CAPABILITY_BLOCK, value);
    qmp_migrate_set_capabilities(cap, errp);
    qapi_free_MigrationCapabilityStatusList(cap);
}

static void migrate_set_block_incremental(MigrationState *s, bool value)
{
    s->parameters.block_incremental = value;
}

static void block_cleanup_parameters(MigrationState *s)
{
    if (s->must_remove_block_options) {
        /* setting to false can never fail */
        migrate_set_block_enabled(false, &error_abort);
        migrate_set_block_incremental(s, false);
        s->must_remove_block_options = false;
    }
}

static void migrate_fd_cleanup(void *opaque)
{
    MigrationState *s = opaque;

    qemu_bh_delete(s->cleanup_bh);
    s->cleanup_bh = NULL;

    qemu_savevm_state_cleanup();

    if (s->to_dst_file) {
        Error *local_err = NULL;
        QEMUFile *tmp;

        trace_migrate_fd_cleanup();
        qemu_mutex_unlock_iothread();
        if (s->migration_thread_running) {
            qemu_thread_join(&s->thread);
            s->migration_thread_running = false;
        }
        qemu_mutex_lock_iothread();

        if (multifd_save_cleanup(&local_err) != 0) {
            error_report_err(local_err);
        }
        qemu_mutex_lock(&s->qemu_file_lock);
        tmp = s->to_dst_file;
        s->to_dst_file = NULL;
        qemu_mutex_unlock(&s->qemu_file_lock);
        /*
         * Close the file handle without the lock to make sure the
         * critical section won't block for long.
         */
        qemu_fclose(tmp);
    }

    assert((s->state != MIGRATION_STATUS_ACTIVE) &&
           (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE));

    if (s->state == MIGRATION_STATUS_CANCELLING) {
        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
                          MIGRATION_STATUS_CANCELLED);
    }

    if (s->error) {
        /* It is used by 'info migrate'; we can't free it */
        error_report_err(error_copy(s->error));
    }
    notifier_list_notify(&migration_state_notifiers, s);
    block_cleanup_parameters(s);
}

void migrate_set_error(MigrationState *s, const Error *error)
{
    qemu_mutex_lock(&s->error_mutex);
    if (!s->error) {
        s->error = error_copy(error);
    }
    qemu_mutex_unlock(&s->error_mutex);
}

void migrate_fd_error(MigrationState *s, const Error *error)
{
    trace_migrate_fd_error(error_get_pretty(error));
    assert(s->to_dst_file == NULL);
    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_FAILED);
    migrate_set_error(s, error);
}

static void migrate_fd_cancel(MigrationState *s)
{
    int old_state;
    QEMUFile *f = migrate_get_current()->to_dst_file;
    trace_migrate_fd_cancel();

    if (s->rp_state.from_dst_file) {
        /* shut down the rp socket, causing the rp thread to shut down */
        qemu_file_shutdown(s->rp_state.from_dst_file);
    }

    do {
        old_state = s->state;
        if (!migration_is_setup_or_active(old_state)) {
            break;
        }
        /* If the migration is paused, kick it out of the pause */
        if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
            qemu_sem_post(&s->pause_sem);
        }
        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
    } while (s->state != MIGRATION_STATUS_CANCELLING);

    /*
     * If we're unlucky the migration code might be stuck somewhere in a
     * send/write while the network has failed and is waiting to timeout;
     * if we've got shutdown(2) available then we can force it to quit.
     * The outgoing qemu file gets closed in migrate_fd_cleanup that is
     * called in a bh, so there is no race against this cancel.
     */
    if (s->state == MIGRATION_STATUS_CANCELLING && f) {
        qemu_file_shutdown(f);
    }
    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
        Error *local_err = NULL;

        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        } else {
            s->block_inactive = false;
        }
    }
}

void add_migration_state_change_notifier(Notifier *notify)
{
    notifier_list_add(&migration_state_notifiers, notify);
}

void remove_migration_state_change_notifier(Notifier *notify)
{
    notifier_remove(notify);
}

bool migration_in_setup(MigrationState *s)
{
    return s->state == MIGRATION_STATUS_SETUP;
}

bool migration_has_finished(MigrationState *s)
{
    return s->state == MIGRATION_STATUS_COMPLETED;
}

bool migration_has_failed(MigrationState *s)
{
    return (s->state == MIGRATION_STATUS_CANCELLED ||
            s->state == MIGRATION_STATUS_FAILED);
}

bool migration_in_postcopy(void)
{
    MigrationState *s = migrate_get_current();

    return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
}

bool migration_in_postcopy_after_devices(MigrationState *s)
{
    return migration_in_postcopy() && s->postcopy_after_devices;
}

bool migration_is_idle(void)
{
    MigrationState *s = migrate_get_current();

    switch (s->state) {
    case MIGRATION_STATUS_NONE:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_COMPLETED:
    case MIGRATION_STATUS_FAILED:
        return true;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_COLO:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
        return false;
    case MIGRATION_STATUS__MAX:
        g_assert_not_reached();
    }

    return false;
}

void migrate_init(MigrationState *s)
{
    /*
     * Reinitialise all migration state, except
     * parameters/capabilities that the user set, and
     * locks.
     */
    s->bytes_xfer = 0;
    s->xfer_limit = 0;
    s->cleanup_bh = NULL;
    s->to_dst_file = NULL;
    s->state = MIGRATION_STATUS_NONE;
    s->rp_state.from_dst_file = NULL;
    s->rp_state.error = false;
    s->mbps = 0.0;
    s->downtime = 0;
    s->expected_downtime = 0;
    s->setup_time = 0;
    s->start_postcopy = false;
    s->postcopy_after_devices = false;
    s->migration_thread_running = false;
    error_free(s->error);
    s->error = NULL;

    migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);

    s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    s->total_time = 0;
    s->vm_was_running = false;
    s->iteration_initial_bytes = 0;
    s->threshold_size = 0;
}

static GSList *migration_blockers;

int migrate_add_blocker(Error *reason, Error **errp)
{
    if (migrate_get_current()->only_migratable) {
        error_propagate(errp, error_copy(reason));
        error_prepend(errp, "disallowing migration blocker "
                          "(--only_migratable) for: ");
        return -EACCES;
    }

    if (migration_is_idle()) {
        migration_blockers = g_slist_prepend(migration_blockers, reason);
        return 0;
    }

    error_propagate(errp, error_copy(reason));
    error_prepend(errp, "disallowing migration blocker (migration in "
                      "progress) for: ");
    return -EBUSY;
}

void migrate_del_blocker(Error *reason)
{
    migration_blockers = g_slist_remove(migration_blockers, reason);
}

void qmp_migrate_incoming(const char *uri, Error **errp)
{
    Error *local_err = NULL;
    static bool once = true;

    if (!deferred_incoming) {
        error_setg(errp, "For use with '-incoming defer'");
        return;
    }
    if (!once) {
        error_setg(errp, "The incoming migration has already been started");
        return;
    }

    qemu_start_incoming_migration(uri, &local_err);

    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    once = false;
}

void qmp_migrate_recover(const char *uri, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
        error_setg(errp, "Migrate recover can only be run "
                   "when postcopy is paused.");
        return;
    }

    if (atomic_cmpxchg(&mis->postcopy_recover_triggered,
                       false, true) == true) {
        error_setg(errp, "Migrate recovery is triggered already");
        return;
    }

    /*
     * Note that this call will never start a real migration; it will
     * only re-setup the migration stream and poke the existing migration
     * to continue using that newly established channel.
     */
    qemu_start_incoming_migration(uri, errp);
}
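
/*
 * Illustrative QMP usage (destination side, after a paused postcopy;
 * the URI is an example value):
 *
 *   { "execute": "migrate-recover",
 *     "arguments": { "uri": "tcp:0.0.0.0:5556" } }
 */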

void qmp_migrate_pause(Error **errp)
{
    MigrationState *ms = migrate_get_current();
    MigrationIncomingState *mis = migration_incoming_get_current();
    int ret;

    if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        /* Source side, during postcopy */
        qemu_mutex_lock(&ms->qemu_file_lock);
        ret = qemu_file_shutdown(ms->to_dst_file);
        qemu_mutex_unlock(&ms->qemu_file_lock);
        if (ret) {
            error_setg(errp, "Failed to pause source migration");
        }
        return;
    }

    if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        ret = qemu_file_shutdown(mis->from_src_file);
        if (ret) {
            error_setg(errp, "Failed to pause destination migration");
        }
        return;
    }

    error_setg(errp, "migrate-pause is currently only supported "
               "during postcopy-active state");
}

bool migration_is_blocked(Error **errp)
{
    if (qemu_savevm_state_blocked(errp)) {
        return true;
    }

    if (migration_blockers) {
        error_propagate(errp, error_copy(migration_blockers->data));
        return true;
    }

    return false;
}

/* Returns true if we should continue the migration, or false if an
 * error was detected. */
static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
                            bool resume, Error **errp)
{
    Error *local_err = NULL;

    if (resume) {
        if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
            error_setg(errp, "Cannot resume if there is no "
                       "paused migration");
            return false;
        }
        /* This is a resume, skip init status */
        return true;
    }

    if (migration_is_setup_or_active(s->state) ||
        s->state == MIGRATION_STATUS_CANCELLING ||
        s->state == MIGRATION_STATUS_COLO) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return false;
    }

    if (runstate_check(RUN_STATE_INMIGRATE)) {
        error_setg(errp, "Guest is waiting for an incoming migration");
        return false;
    }

    if (migration_is_blocked(errp)) {
        return false;
    }

    if (blk || blk_inc) {
        if (migrate_use_block() || migrate_use_block_incremental()) {
            error_setg(errp, "Command options are incompatible with "
                       "current migration capabilities");
            return false;
        }
        migrate_set_block_enabled(true, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return false;
        }
        s->must_remove_block_options = true;
    }

    if (blk_inc) {
        migrate_set_block_incremental(s, true);
    }

    migrate_init(s);

    return true;
}
1627 
1628 void qmp_migrate(const char *uri, bool has_blk, bool blk,
1629                  bool has_inc, bool inc, bool has_detach, bool detach,
1630                  bool has_resume, bool resume, Error **errp)
1631 {
1632     Error *local_err = NULL;
1633     MigrationState *s = migrate_get_current();
1634     const char *p;
1635 
1636     if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
1637                          has_resume && resume, errp)) {
1638         /* Error detected, put into errp */
1639         return;
1640     }
1641 
1642     if (strstart(uri, "tcp:", &p)) {
1643         tcp_start_outgoing_migration(s, p, &local_err);
1644 #ifdef CONFIG_RDMA
1645     } else if (strstart(uri, "rdma:", &p)) {
1646         rdma_start_outgoing_migration(s, p, &local_err);
1647 #endif
1648     } else if (strstart(uri, "exec:", &p)) {
1649         exec_start_outgoing_migration(s, p, &local_err);
1650     } else if (strstart(uri, "unix:", &p)) {
1651         unix_start_outgoing_migration(s, p, &local_err);
1652     } else if (strstart(uri, "fd:", &p)) {
1653         fd_start_outgoing_migration(s, p, &local_err);
1654     } else {
1655         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
1656                    "a valid migration protocol");
1657         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1658                           MIGRATION_STATUS_FAILED);
1659         block_cleanup_parameters(s);
1660         return;
1661     }
1662 
1663     if (local_err) {
1664         migrate_fd_error(s, local_err);
1665         error_propagate(errp, local_err);
1666         return;
1667     }
1668 }
1669 
1670 void qmp_migrate_cancel(Error **errp)
1671 {
1672     migrate_fd_cancel(migrate_get_current());
1673 }
1674 
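/*
 * QMP command to release a migration that is blocked in the given
 * state, e.g. the pre-switchover pause.
 */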
1675 void qmp_migrate_continue(MigrationStatus state, Error **errp)
1676 {
1677     MigrationState *s = migrate_get_current();
1678     if (s->state != state) {
1679         error_setg(errp, "Migration not in expected state: %s",
1680                    MigrationStatus_str(s->state));
1681         return;
1682     }
1683     qemu_sem_post(&s->pause_sem);
1684 }
1685 
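/* Legacy QMP command; forwards to the xbzrle-cache-size parameter */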
1686 void qmp_migrate_set_cache_size(int64_t value, Error **errp)
1687 {
1688     MigrateSetParameters p = {
1689         .has_xbzrle_cache_size = true,
1690         .xbzrle_cache_size = value,
1691     };
1692 
1693     qmp_migrate_set_parameters(&p, errp);
1694 }
1695 
1696 int64_t qmp_query_migrate_cache_size(Error **errp)
1697 {
1698     return migrate_xbzrle_cache_size();
1699 }
1700 
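/* Legacy QMP command; forwards to the max-bandwidth parameter */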
1701 void qmp_migrate_set_speed(int64_t value, Error **errp)
1702 {
1703     MigrateSetParameters p = {
1704         .has_max_bandwidth = true,
1705         .max_bandwidth = value,
1706     };
1707 
1708     qmp_migrate_set_parameters(&p, errp);
1709 }
1710 
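/*
 * Legacy QMP command; validates the value (given in seconds), converts
 * it to milliseconds and forwards it to the downtime-limit parameter.
 */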
1711 void qmp_migrate_set_downtime(double value, Error **errp)
1712 {
1713     if (value < 0 || value > MAX_MIGRATE_DOWNTIME_SECONDS) {
1714         error_setg(errp, "Parameter 'downtime_limit' expects an integer in "
1715                          "the range of 0 to %d seconds",
1716                          MAX_MIGRATE_DOWNTIME_SECONDS);
1717         return;
1718     }
1719 
1720     value *= 1000; /* Convert to milliseconds */
1721     value = MAX(0, MIN(INT64_MAX, value));
1722 
1723     MigrateSetParameters p = {
1724         .has_downtime_limit = true,
1725         .downtime_limit = value,
1726     };
1727 
1728     qmp_migrate_set_parameters(&p, errp);
1729 }
1730 
1731 bool migrate_release_ram(void)
1732 {
1733     MigrationState *s;
1734 
1735     s = migrate_get_current();
1736 
1737     return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
1738 }
1739 
1740 bool migrate_postcopy_ram(void)
1741 {
1742     MigrationState *s;
1743 
1744     s = migrate_get_current();
1745 
1746     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
1747 }
1748 
1749 bool migrate_postcopy(void)
1750 {
1751     return migrate_postcopy_ram() || migrate_dirty_bitmaps();
1752 }
1753 
1754 bool migrate_auto_converge(void)
1755 {
1756     MigrationState *s;
1757 
1758     s = migrate_get_current();
1759 
1760     return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
1761 }
1762 
1763 bool migrate_zero_blocks(void)
1764 {
1765     MigrationState *s;
1766 
1767     s = migrate_get_current();
1768 
1769     return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
1770 }
1771 
1772 bool migrate_postcopy_blocktime(void)
1773 {
1774     MigrationState *s;
1775 
1776     s = migrate_get_current();
1777 
1778     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
1779 }
1780 
1781 bool migrate_use_compression(void)
1782 {
1783     MigrationState *s;
1784 
1785     s = migrate_get_current();
1786 
1787     return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
1788 }
1789 
1790 int migrate_compress_level(void)
1791 {
1792     MigrationState *s;
1793 
1794     s = migrate_get_current();
1795 
1796     return s->parameters.compress_level;
1797 }
1798 
1799 int migrate_compress_threads(void)
1800 {
1801     MigrationState *s;
1802 
1803     s = migrate_get_current();
1804 
1805     return s->parameters.compress_threads;
1806 }
1807 
1808 int migrate_decompress_threads(void)
1809 {
1810     MigrationState *s;
1811 
1812     s = migrate_get_current();
1813 
1814     return s->parameters.decompress_threads;
1815 }
1816 
1817 bool migrate_dirty_bitmaps(void)
1818 {
1819     MigrationState *s;
1820 
1821     s = migrate_get_current();
1822 
1823     return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
1824 }
1825 
1826 bool migrate_use_events(void)
1827 {
1828     MigrationState *s;
1829 
1830     s = migrate_get_current();
1831 
1832     return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
1833 }
1834 
1835 bool migrate_use_multifd(void)
1836 {
1837     MigrationState *s;
1838 
1839     s = migrate_get_current();
1840 
1841     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_MULTIFD];
1842 }
1843 
1844 bool migrate_pause_before_switchover(void)
1845 {
1846     MigrationState *s;
1847 
1848     s = migrate_get_current();
1849 
1850     return s->enabled_capabilities[
1851         MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
1852 }
1853 
1854 int migrate_multifd_channels(void)
1855 {
1856     MigrationState *s;
1857 
1858     s = migrate_get_current();
1859 
1860     return s->parameters.x_multifd_channels;
1861 }
1862 
1863 int migrate_multifd_page_count(void)
1864 {
1865     MigrationState *s;
1866 
1867     s = migrate_get_current();
1868 
1869     return s->parameters.x_multifd_page_count;
1870 }
1871 
1872 int migrate_use_xbzrle(void)
1873 {
1874     MigrationState *s;
1875 
1876     s = migrate_get_current();
1877 
1878     return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
1879 }
1880 
1881 int64_t migrate_xbzrle_cache_size(void)
1882 {
1883     MigrationState *s;
1884 
1885     s = migrate_get_current();
1886 
1887     return s->parameters.xbzrle_cache_size;
1888 }
1889 
1890 bool migrate_use_block(void)
1891 {
1892     MigrationState *s;
1893 
1894     s = migrate_get_current();
1895 
1896     return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK];
1897 }
1898 
1899 bool migrate_use_return_path(void)
1900 {
1901     MigrationState *s;
1902 
1903     s = migrate_get_current();
1904 
1905     return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
1906 }
1907 
1908 bool migrate_use_block_incremental(void)
1909 {
1910     MigrationState *s;
1911 
1912     s = migrate_get_current();
1913 
1914     return s->parameters.block_incremental;
1915 }
1916 
1917 /* migration thread support */
1918 /*
1919  * Something bad happened to the RP stream; mark an error.
1920  * The caller shall print or trace something to indicate why.
1921  */
1922 static void mark_source_rp_bad(MigrationState *s)
1923 {
1924     s->rp_state.error = true;
1925 }
1926 
1927 static struct rp_cmd_args {
1928     ssize_t     len; /* -1 = variable */
1929     const char *name;
1930 } rp_cmd_args[] = {
1931     [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
1932     [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
1933     [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
1934     [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
1935     [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
1936     [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
1937     [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
1938     [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
1939 };
1940 
1941 /*
1942  * Process a request for pages received on the return path.
1943  * We're allowed to send more than requested (e.g. to round to our page size)
1944  * and we don't need to send pages that have already been sent.
1945  */
1946 static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
1947                                        ram_addr_t start, size_t len)
1948 {
1949     long our_host_ps = getpagesize();
1950 
1951     trace_migrate_handle_rp_req_pages(rbname, start, len);
1952 
1953     /*
1954      * Since we currently insist on matching page sizes, just sanity check
1955      * we're being asked for whole host pages.
1956      */
1957     if ((start & (our_host_ps - 1)) ||
1958         (len & (our_host_ps - 1))) {
1959         error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
1960                      " len: %zd", __func__, start, len);
1961         mark_source_rp_bad(ms);
1962         return;
1963     }
1964 
1965     if (ram_save_queue_pages(rbname, start, len)) {
1966         mark_source_rp_bad(ms);
1967     }
1968 }
1969 
1970 /* Return true to retry, false to quit */
1971 static bool postcopy_pause_return_path_thread(MigrationState *s)
1972 {
1973     trace_postcopy_pause_return_path();
1974 
1975     qemu_sem_wait(&s->postcopy_pause_rp_sem);
1976 
1977     trace_postcopy_pause_return_path_continued();
1978 
1979     return true;
1980 }
1981 
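/*
 * Handle a RECV_BITMAP message from the destination during postcopy
 * recovery: look up the named RAMBlock and reload its dirty bitmap.
 */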
1982 static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
1983 {
1984     RAMBlock *block = qemu_ram_block_by_name(block_name);
1985 
1986     if (!block) {
1987         error_report("%s: invalid block name '%s'", __func__, block_name);
1988         return -EINVAL;
1989     }
1990 
1991     /* Fetch the received bitmap and refresh the dirty bitmap */
1992     return ram_dirty_bitmap_reload(s, block);
1993 }
1994 
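/*
 * Handle a RESUME_ACK message from the destination: both sides are
 * active again, so switch back to POSTCOPY_ACTIVE and kick the
 * migration thread to continue sending pages.
 */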
1995 static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
1996 {
1997     trace_source_return_path_thread_resume_ack(value);
1998 
1999     if (value != MIGRATION_RESUME_ACK_VALUE) {
2000         error_report("%s: illegal resume_ack value %"PRIu32,
2001                      __func__, value);
2002         return -1;
2003     }
2004 
2005     /* Now both sides are active. */
2006     migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2007                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
2008 
2009     /* Notify the send thread that it's time to continue sending pages */
2010     qemu_sem_post(&s->rp_state.rp_sem);
2011 
2012     return 0;
2013 }
2014 
2015 /*
2016  * Handles messages sent on the return path towards the source VM.
2018  */
2019 static void *source_return_path_thread(void *opaque)
2020 {
2021     MigrationState *ms = opaque;
2022     QEMUFile *rp = ms->rp_state.from_dst_file;
2023     uint16_t header_len, header_type;
2024     uint8_t buf[512];
2025     uint32_t tmp32, sibling_error;
2026     ram_addr_t start = 0; /* =0 to silence warning */
2027     size_t  len = 0, expected_len;
2028     int res;
2029 
2030     trace_source_return_path_thread_entry();
2031 
2032 retry:
2033     while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
2034            migration_is_setup_or_active(ms->state)) {
2035         trace_source_return_path_thread_loop_top();
2036         header_type = qemu_get_be16(rp);
2037         header_len = qemu_get_be16(rp);
2038 
2039         if (qemu_file_get_error(rp)) {
2040             mark_source_rp_bad(ms);
2041             goto out;
2042         }
2043 
2044         if (header_type >= MIG_RP_MSG_MAX ||
2045             header_type == MIG_RP_MSG_INVALID) {
2046             error_report("RP: Received invalid message 0x%04x length 0x%04x",
2047                     header_type, header_len);
2048             mark_source_rp_bad(ms);
2049             goto out;
2050         }
2051 
2052         if ((rp_cmd_args[header_type].len != -1 &&
2053             header_len != rp_cmd_args[header_type].len) ||
2054             header_len > sizeof(buf)) {
2055             error_report("RP: Received '%s' message (0x%04x) with"
2056                     "incorrect length %d expecting %zu",
2057                     rp_cmd_args[header_type].name, header_type, header_len,
2058                     (size_t)rp_cmd_args[header_type].len);
2059             mark_source_rp_bad(ms);
2060             goto out;
2061         }
2062 
2063         /* We know we've got a valid header by this point */
2064         res = qemu_get_buffer(rp, buf, header_len);
2065         if (res != header_len) {
2066             error_report("RP: Failed reading data for message 0x%04x"
2067                          " read %d expected %d",
2068                          header_type, res, header_len);
2069             mark_source_rp_bad(ms);
2070             goto out;
2071         }
2072 
2073         /* OK, we have the message and the data */
2074         switch (header_type) {
2075         case MIG_RP_MSG_SHUT:
2076             sibling_error = ldl_be_p(buf);
2077             trace_source_return_path_thread_shut(sibling_error);
2078             if (sibling_error) {
2079                 error_report("RP: Sibling indicated error %d", sibling_error);
2080                 mark_source_rp_bad(ms);
2081             }
2082             /*
2083          * We'll let the main thread deal with closing the RP;
2084              * we could do a shutdown(2) on it, but we're the only user
2085              * anyway, so there's nothing gained.
2086              */
2087             goto out;
2088 
2089         case MIG_RP_MSG_PONG:
2090             tmp32 = ldl_be_p(buf);
2091             trace_source_return_path_thread_pong(tmp32);
2092             break;
2093 
2094         case MIG_RP_MSG_REQ_PAGES:
2095             start = ldq_be_p(buf);
2096             len = ldl_be_p(buf + 8);
2097             migrate_handle_rp_req_pages(ms, NULL, start, len);
2098             break;
2099 
2100         case MIG_RP_MSG_REQ_PAGES_ID:
2101             expected_len = 12 + 1; /* header + termination */
2102 
2103             if (header_len >= expected_len) {
2104                 start = ldq_be_p(buf);
2105                 len = ldl_be_p(buf + 8);
2106                 /* Now we expect an idstr */
2107                 tmp32 = buf[12]; /* Length of the following idstr */
2108                 buf[13 + tmp32] = '\0';
2109                 expected_len += tmp32;
2110             }
2111             if (header_len != expected_len) {
2112                 error_report("RP: Req_Page_id with length %d expecting %zd",
2113                         header_len, expected_len);
2114                 mark_source_rp_bad(ms);
2115                 goto out;
2116             }
2117             migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
2118             break;
2119 
2120         case MIG_RP_MSG_RECV_BITMAP:
2121             if (header_len < 1) {
2122                 error_report("%s: missing block name", __func__);
2123                 mark_source_rp_bad(ms);
2124                 goto out;
2125             }
2126             /* Format: len (1B) + idstr (<255B). This ends the idstr. */
2127             buf[buf[0] + 1] = '\0';
2128             if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
2129                 mark_source_rp_bad(ms);
2130                 goto out;
2131             }
2132             break;
2133 
2134         case MIG_RP_MSG_RESUME_ACK:
2135             tmp32 = ldl_be_p(buf);
2136             if (migrate_handle_rp_resume_ack(ms, tmp32)) {
2137                 mark_source_rp_bad(ms);
2138                 goto out;
2139             }
2140             break;
2141 
2142         default:
2143             break;
2144         }
2145     }
2146 
2147 out:
2148     res = qemu_file_get_error(rp);
2149     if (res) {
2150         if (res == -EIO) {
2151             /*
2152              * Maybe there is something we can do: it looks like a
2153              * network-down issue, so we pause and wait for recovery.
2154              */
2155             if (postcopy_pause_return_path_thread(ms)) {
2156                 /* Reload rp, reset the rest */
2157                 rp = ms->rp_state.from_dst_file;
2158                 ms->rp_state.error = false;
2159                 goto retry;
2160             }
2161         }
2162 
2163         trace_source_return_path_thread_bad_end();
2164         mark_source_rp_bad(ms);
2165     }
2166 
2167     trace_source_return_path_thread_end();
2168     ms->rp_state.from_dst_file = NULL;
2169     qemu_fclose(rp);
2170     return NULL;
2171 }
2172 
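/*
 * Open the return path from the destination and, if requested, spawn
 * the thread that processes its messages (skipped on postcopy resume,
 * where the paused thread is reused).
 */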
2173 static int open_return_path_on_source(MigrationState *ms,
2174                                       bool create_thread)
2175 {
2177     ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
2178     if (!ms->rp_state.from_dst_file) {
2179         return -1;
2180     }
2181 
2182     trace_open_return_path_on_source();
2183 
2184     if (!create_thread) {
2185         /* We're done */
2186         return 0;
2187     }
2188 
2189     qemu_thread_create(&ms->rp_state.rp_thread, "return path",
2190                        source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
2191 
2192     trace_open_return_path_on_source_continue();
2193 
2194     return 0;
2195 }
2196 
2197 /* Returns 0 if the RP was ok, otherwise there was an error on the RP */
2198 static int await_return_path_close_on_source(MigrationState *ms)
2199 {
2200     /*
2201      * If this is a normal exit then the destination will send a SHUT and the
2202      * rp_thread will exit; however, if there's an error we need to cause
2203      * it to exit.
2204      */
2205     if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
2206         /*
2207          * shutdown(2), if we have it, will cause it to unblock if it's stuck
2208          * waiting for the destination.
2209          */
2210         qemu_file_shutdown(ms->rp_state.from_dst_file);
2211         mark_source_rp_bad(ms);
2212     }
2213     trace_await_return_path_close_on_source_joining();
2214     qemu_thread_join(&ms->rp_state.rp_thread);
2215     trace_await_return_path_close_on_source_close();
2216     return ms->rp_state.error;
2217 }
2218 
2219 /*
2220  * Switch from normal iteration to postcopy
2221  * Returns non-0 on error
2222  */
2223 static int postcopy_start(MigrationState *ms)
2224 {
2225     int ret;
2226     QIOChannelBuffer *bioc;
2227     QEMUFile *fb;
2228     int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2229     bool restart_block = false;
2230     int cur_state = MIGRATION_STATUS_ACTIVE;
2231     if (!migrate_pause_before_switchover()) {
2232         migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
2233                           MIGRATION_STATUS_POSTCOPY_ACTIVE);
2234     }
2235 
2236     trace_postcopy_start();
2237     qemu_mutex_lock_iothread();
2238     trace_postcopy_start_set_run();
2239 
2240     qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
2241     global_state_store();
2242     ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
2243     if (ret < 0) {
2244         goto fail;
2245     }
2246 
2247     ret = migration_maybe_pause(ms, &cur_state,
2248                                 MIGRATION_STATUS_POSTCOPY_ACTIVE);
2249     if (ret < 0) {
2250         goto fail;
2251     }
2252 
2253     ret = bdrv_inactivate_all();
2254     if (ret < 0) {
2255         goto fail;
2256     }
2257     restart_block = true;
2258 
2259     /*
2260      * Cause any non-postcopiable, but iterative devices to
2261      * send out their final data.
2262      */
2263     qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
2264 
2265     /*
2266      * In 'finish migrate' state and with the io-lock held, everything
2267      * should be quiet, but we've potentially still got dirty pages, and
2268      * we need to tell the destination to throw away any pages it's
2269      * already received that are dirty.
2270      */
2271     if (migrate_postcopy_ram()) {
2272         if (ram_postcopy_send_discard_bitmap(ms)) {
2273             error_report("postcopy send discard bitmap failed");
2274             goto fail;
2275         }
2276     }
2277 
2278     /*
2279      * Send the rest of the state - note that devices doing postcopy
2280      * will notice we're in POSTCOPY_ACTIVE and not actually
2281      * wrap their state up here.
2282      */
2283     qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
2284     if (migrate_postcopy_ram()) {
2285         /* Ping just for debugging, helps line traces up */
2286         qemu_savevm_send_ping(ms->to_dst_file, 2);
2287     }
2288 
2289     /*
2290      * While loading the device state we may trigger page transfer
2291      * requests and the fd must be free to process those, and thus
2292      * the destination must read the whole device state off the fd before
2293      * it starts processing it.  Unfortunately the ad-hoc migration format
2294      * doesn't allow the destination to know the size to read without fully
2295      * parsing it through each device's load-state code (especially the open
2296      * coded devices that use get/put).
2297      * So we wrap the device state up in a package with a length at the start;
2298      * to do this we use a qemu_buf to hold the whole of the device state.
2299      */
2300     bioc = qio_channel_buffer_new(4096);
2301     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
2302     fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
2303     object_unref(OBJECT(bioc));
2304 
2305     /*
2306      * Make sure the receiver can get incoming pages before we send the rest
2307      * of the state
2308      */
2309     qemu_savevm_send_postcopy_listen(fb);
2310 
2311     qemu_savevm_state_complete_precopy(fb, false, false);
2312     if (migrate_postcopy_ram()) {
2313         qemu_savevm_send_ping(fb, 3);
2314     }
2315 
2316     qemu_savevm_send_postcopy_run(fb);
2317 
2318     /* <><> end of stuff going into the package */
2319 
2320     /* Last point of recovery; as soon as we send the package the destination
2321      * can open devices and potentially start running.
2322      * Let's just check again that we've not got any errors.
2323      */
2324     ret = qemu_file_get_error(ms->to_dst_file);
2325     if (ret) {
2326         error_report("postcopy_start: Migration stream errored (pre package)");
2327         goto fail_closefb;
2328     }
2329 
2330     restart_block = false;
2331 
2332     /* Now send that blob */
2333     if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
2334         goto fail_closefb;
2335     }
2336     qemu_fclose(fb);
2337 
2338     /* Send a notification so that anything that needs to act at the
2339      * transition to postcopy, after the device state, gets a chance to run;
2340      * in particular spice needs to trigger a transition now.
2341      */
2342     ms->postcopy_after_devices = true;
2343     notifier_list_notify(&migration_state_notifiers, ms);
2344 
2345     ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
2346 
2347     qemu_mutex_unlock_iothread();
2348 
2349     if (migrate_postcopy_ram()) {
2350         /*
2351          * Although this ping is just for debug, it could potentially be
2352          * used for getting a better measurement of downtime at the source.
2353          */
2354         qemu_savevm_send_ping(ms->to_dst_file, 4);
2355     }
2356 
2357     if (migrate_release_ram()) {
2358         ram_postcopy_migrated_memory_release(ms);
2359     }
2360 
2361     ret = qemu_file_get_error(ms->to_dst_file);
2362     if (ret) {
2363         error_report("postcopy_start: Migration stream errored");
2364         migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2365                           MIGRATION_STATUS_FAILED);
2366     }
2367 
2368     return ret;
2369 
2370 fail_closefb:
2371     qemu_fclose(fb);
2372 fail:
2373     migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2374                       MIGRATION_STATUS_FAILED);
2375     if (restart_block) {
2376         /* A failure happened early enough that we know the destination hasn't
2377          * accessed block devices, so we're safe to recover.
2378          */
2379         Error *local_err = NULL;
2380 
2381         bdrv_invalidate_cache_all(&local_err);
2382         if (local_err) {
2383             error_report_err(local_err);
2384         }
2385     }
2386     qemu_mutex_unlock_iothread();
2387     return -1;
2388 }
2389 
2390 /**
2391  * migration_maybe_pause: Pause if required to by
2392  * migrate_pause_before_switchover called with the iothread locked
2393  * Returns: 0 on success
2394  */
2395 static int migration_maybe_pause(MigrationState *s,
2396                                  int *current_active_state,
2397                                  int new_state)
2398 {
2399     if (!migrate_pause_before_switchover()) {
2400         return 0;
2401     }
2402 
2403     /* Since leaving this state is not atomic with posting the semaphore
2404      * it's possible that someone could have issued multiple migrate_continue
2405      * and the semaphore is incorrectly positive at this point;
2406      * the docs say it's undefined to reinit a semaphore that's already
2407      * init'd, so use timedwait to eat up any existing posts.
2408      */
2409     while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
2410         /* This block intentionally left blank */
2411     }
2412 
2413     qemu_mutex_unlock_iothread();
2414     migrate_set_state(&s->state, *current_active_state,
2415                       MIGRATION_STATUS_PRE_SWITCHOVER);
2416     qemu_sem_wait(&s->pause_sem);
2417     migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
2418                       new_state);
2419     *current_active_state = new_state;
2420     qemu_mutex_lock_iothread();
2421 
2422     return s->state == new_state ? 0 : -EINVAL;
2423 }
2424 
2425 /**
2426  * migration_completion: Used by migration_thread when there's not much left.
2427  *   The caller 'breaks' the loop when this returns.
2428  *
2429  * @s: Current migration state
2430  */
2431 static void migration_completion(MigrationState *s)
2432 {
2433     int ret;
2434     int current_active_state = s->state;
2435 
2436     if (s->state == MIGRATION_STATUS_ACTIVE) {
2437         qemu_mutex_lock_iothread();
2438         s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2439         qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
2440         s->vm_was_running = runstate_is_running();
2441         ret = global_state_store();
2442 
2443         if (!ret) {
2444             bool inactivate = !migrate_colo_enabled();
2445             ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
2446             if (ret >= 0) {
2447                 ret = migration_maybe_pause(s, &current_active_state,
2448                                             MIGRATION_STATUS_DEVICE);
2449             }
2450             if (ret >= 0) {
2451                 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
2452                 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
2453                                                          inactivate);
2454             }
2455             if (inactivate && ret >= 0) {
2456                 s->block_inactive = true;
2457             }
2458         }
2459         qemu_mutex_unlock_iothread();
2460 
2461         if (ret < 0) {
2462             goto fail;
2463         }
2464     } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2465         trace_migration_completion_postcopy_end();
2466 
2467         qemu_savevm_state_complete_postcopy(s->to_dst_file);
2468         trace_migration_completion_postcopy_end_after_complete();
2469     }
2470 
2471     /*
2472      * If rp was opened we must clean up the thread before
2473      * cleaning everything else up (since if there are no failures
2474      * it will wait for the destination to send its status in
2475      * a SHUT command).
2476      */
2477     if (s->rp_state.from_dst_file) {
2478         int rp_error;
2479         trace_migration_return_path_end_before();
2480         rp_error = await_return_path_close_on_source(s);
2481         trace_migration_return_path_end_after(rp_error);
2482         if (rp_error) {
2483             goto fail_invalidate;
2484         }
2485     }
2486 
2487     if (qemu_file_get_error(s->to_dst_file)) {
2488         trace_migration_completion_file_err();
2489         goto fail_invalidate;
2490     }
2491 
2492     if (!migrate_colo_enabled()) {
2493         migrate_set_state(&s->state, current_active_state,
2494                           MIGRATION_STATUS_COMPLETED);
2495     }
2496 
2497     return;
2498 
2499 fail_invalidate:
2500     /* If not doing postcopy, vm_start() will be called: let's regain
2501      * control of the images.
2502      */
2503     if (s->state == MIGRATION_STATUS_ACTIVE ||
2504         s->state == MIGRATION_STATUS_DEVICE) {
2505         Error *local_err = NULL;
2506 
2507         qemu_mutex_lock_iothread();
2508         bdrv_invalidate_cache_all(&local_err);
2509         if (local_err) {
2510             error_report_err(local_err);
2511         } else {
2512             s->block_inactive = false;
2513         }
2514         qemu_mutex_unlock_iothread();
2515     }
2516 
2517 fail:
2518     migrate_set_state(&s->state, current_active_state,
2519                       MIGRATION_STATUS_FAILED);
2520 }
2521 
2522 bool migrate_colo_enabled(void)
2523 {
2524     MigrationState *s = migrate_get_current();
2525     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
2526 }
2527 
2528 typedef enum MigThrError {
2529     /* No error detected */
2530     MIG_THR_ERR_NONE = 0,
2531     /* Detected error, but resumed successfully */
2532     MIG_THR_ERR_RECOVERED = 1,
2533     /* Detected fatal error, need to exit */
2534     MIG_THR_ERR_FATAL = 2,
2535 } MigThrError;
2536 
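/*
 * Final step of a postcopy resume: send the resume command to the
 * destination, then wait until the return path thread sees the
 * destination's RESUME_ACK and flips us back to POSTCOPY_ACTIVE.
 */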
2537 static int postcopy_resume_handshake(MigrationState *s)
2538 {
2539     qemu_savevm_send_postcopy_resume(s->to_dst_file);
2540 
2541     while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2542         qemu_sem_wait(&s->rp_state.rp_sem);
2543     }
2544 
2545     if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2546         return 0;
2547     }
2548 
2549     return -1;
2550 }
2551 
2552 /* Return zero on success, or <0 on error */
2553 static int postcopy_do_resume(MigrationState *s)
2554 {
2555     int ret;
2556 
2557     /*
2558      * Call all the resume_prepare() hooks, so that modules can be
2559      * ready for the migration resume.
2560      */
2561     ret = qemu_savevm_state_resume_prepare(s);
2562     if (ret) {
2563         error_report("%s: resume_prepare() failure detected: %d",
2564                      __func__, ret);
2565         return ret;
2566     }
2567 
2568     /*
2569      * Last handshake with destination on the resume (destination will
2570      * switch to postcopy-active afterwards)
2571      */
2572     ret = postcopy_resume_handshake(s);
2573     if (ret) {
2574         error_report("%s: handshake failed: %d", __func__, ret);
2575         return ret;
2576     }
2577 
2578     return 0;
2579 }
2580 
2581 /*
2582  * We don't return until we are in a safe state to continue the current
2583  * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
2584  * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
2585  */
2586 static MigThrError postcopy_pause(MigrationState *s)
2587 {
2588     assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2589 
2590     while (true) {
2591         QEMUFile *file;
2592 
2593         migrate_set_state(&s->state, s->state,
2594                           MIGRATION_STATUS_POSTCOPY_PAUSED);
2595 
2596         /* Current channel is possibly broken. Release it. */
2597         assert(s->to_dst_file);
2598         qemu_mutex_lock(&s->qemu_file_lock);
2599         file = s->to_dst_file;
2600         s->to_dst_file = NULL;
2601         qemu_mutex_unlock(&s->qemu_file_lock);
2602 
2603         qemu_file_shutdown(file);
2604         qemu_fclose(file);
2605 
2606         error_report("Detected IO failure for postcopy. "
2607                      "Migration paused.");
2608 
2609         /*
2610          * We wait here until things are fixed up. Then whoever triggers
2611          * the recovery will set the state back for us.
2612          */
2613         while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2614             qemu_sem_wait(&s->postcopy_pause_sem);
2615         }
2616 
2617         if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2618             /* Woken up by a recovery procedure. Give it a shot */
2619 
2620             /*
2621              * Firstly, let's wake up the return path now, with a new
2622              * return path channel.
2623              */
2624             qemu_sem_post(&s->postcopy_pause_rp_sem);
2625 
2626             /* Do the resume logic */
2627             if (postcopy_do_resume(s) == 0) {
2628                 /* Let's continue! */
2629                 trace_postcopy_pause_continued();
2630                 return MIG_THR_ERR_RECOVERED;
2631             } else {
2632                 /*
2633                  * Something went wrong during the recovery, so let's
2634                  * pause again. Pausing is always better than throwing
2635                  * data away.
2636                  */
2637                 continue;
2638             }
2639         } else {
2640             /* This is not right... Time to quit. */
2641             return MIG_THR_ERR_FATAL;
2642         }
2643     }
2644 }
2645 
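/*
 * Check the outgoing channel for errors and decide whether the
 * migration thread can carry on, has recovered after a pause, or
 * must quit.
 */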
2646 static MigThrError migration_detect_error(MigrationState *s)
2647 {
2648     int ret;
2649 
2650     /* Try to detect any file errors */
2651     ret = qemu_file_get_error(s->to_dst_file);
2652 
2653     if (!ret) {
2654         /* Everything is fine */
2655         return MIG_THR_ERR_NONE;
2656     }
2657 
2658     if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
2659         /*
2660          * For postcopy, we allow the network to be down for a
2661          * while. After that, it can be continued by a
2662          * recovery phase.
2663          */
2664         return postcopy_pause(s);
2665     } else {
2666         /*
2667          * For precopy (or postcopy with an error outside IO), we fail
2668          * immediately.
2669          */
2670         migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
2671         trace_migration_thread_file_err();
2672 
2673         /* Time to stop the migration, now. */
2674         return MIG_THR_ERR_FATAL;
2675     }
2676 }
2677 
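/* Fill in the final timing and throughput statistics on completion */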
2678 static void migration_calculate_complete(MigrationState *s)
2679 {
2680     uint64_t bytes = qemu_ftell(s->to_dst_file);
2681     int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2682 
2683     s->total_time = end_time - s->start_time;
2684     if (!s->downtime) {
2685         /*
2686          * It's still not set, so this is a precopy migration.  For
2687          * postcopy, downtime is calculated during postcopy_start().
2688          */
2689         s->downtime = end_time - s->downtime_start;
2690     }
2691 
2692     if (s->total_time) {
2693         s->mbps = ((double) bytes * 8.0) / s->total_time / 1000;
2694     }
2695 }
2696 
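/*
 * Update the transfer statistics (bandwidth, mbps, expected downtime
 * and threshold size) once per BUFFER_DELAY window, and reset the
 * rate limit counter for the next window.
 */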
2697 static void migration_update_counters(MigrationState *s,
2698                                       int64_t current_time)
2699 {
2700     uint64_t transferred, time_spent;
2701     double bandwidth;
2702 
2703     if (current_time < s->iteration_start_time + BUFFER_DELAY) {
2704         return;
2705     }
2706 
2707     transferred = qemu_ftell(s->to_dst_file) - s->iteration_initial_bytes;
2708     time_spent = current_time - s->iteration_start_time;
2709     bandwidth = (double)transferred / time_spent;
2710     s->threshold_size = bandwidth * s->parameters.downtime_limit;
2711 
2712     s->mbps = (((double) transferred * 8.0) /
2713                ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
2714 
2715     /*
2716      * If we haven't sent anything, we don't want to
2717      * recalculate. 10000 is a small enough number for our purposes.
2718      */
2719     if (ram_counters.dirty_pages_rate && transferred > 10000) {
2720         s->expected_downtime = ram_counters.dirty_pages_rate *
2721             qemu_target_page_size() / bandwidth;
2722     }
2723 
2724     qemu_file_reset_rate_limit(s->to_dst_file);
2725 
2726     s->iteration_start_time = current_time;
2727     s->iteration_initial_bytes = qemu_ftell(s->to_dst_file);
2728 
2729     trace_migrate_transferred(transferred, time_spent,
2730                               bandwidth, s->threshold_size);
2731 }
2732 
2733 /* Migration thread iteration status */
2734 typedef enum {
2735     MIG_ITERATE_RESUME,         /* Resume current iteration */
2736     MIG_ITERATE_SKIP,           /* Skip current iteration */
2737     MIG_ITERATE_BREAK,          /* Break the loop */
2738 } MigIterateState;
2739 
2740 /*
2741  * Run a single iteration of the migration loop and return the
2742  * resulting MigIterateState, which tells the caller how to proceed.
2743  */
2744 static MigIterateState migration_iteration_run(MigrationState *s)
2745 {
2746     uint64_t pending_size, pend_pre, pend_compat, pend_post;
2747     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
2748 
2749     qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
2750                               &pend_compat, &pend_post);
2751     pending_size = pend_pre + pend_compat + pend_post;
2752 
2753     trace_migrate_pending(pending_size, s->threshold_size,
2754                           pend_pre, pend_compat, pend_post);
2755 
2756     if (pending_size && pending_size >= s->threshold_size) {
2757         /* Still a significant amount to transfer */
2758         if (migrate_postcopy() && !in_postcopy &&
2759             pend_pre <= s->threshold_size &&
2760             atomic_read(&s->start_postcopy)) {
2761             if (postcopy_start(s)) {
2762                 error_report("%s: postcopy failed to start", __func__);
2763             }
2764             return MIG_ITERATE_SKIP;
2765         }
2766         /* Just another iteration step */
2767         qemu_savevm_state_iterate(s->to_dst_file,
2768             s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2769     } else {
2770         trace_migration_thread_low_pending(pending_size);
2771         migration_completion(s);
2772         return MIG_ITERATE_BREAK;
2773     }
2774 
2775     return MIG_ITERATE_RESUME;
2776 }
2777 
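/*
 * Tear-down at the end of the migration thread's main loop: restart
 * the VM or set the run state as appropriate for the final migration
 * status, then schedule the cleanup bottom half.
 */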
2778 static void migration_iteration_finish(MigrationState *s)
2779 {
2780     /* If we enabled cpu throttling for auto-converge, turn it off. */
2781     cpu_throttle_stop();
2782 
2783     qemu_mutex_lock_iothread();
2784     switch (s->state) {
2785     case MIGRATION_STATUS_COMPLETED:
2786         migration_calculate_complete(s);
2787         runstate_set(RUN_STATE_POSTMIGRATE);
2788         break;
2789 
2790     case MIGRATION_STATUS_ACTIVE:
2791         /*
2792          * We should really assert here, but since it's during
2793          * migration, let's try to reduce the usage of assertions.
2794          */
2795         if (!migrate_colo_enabled()) {
2796             error_report("%s: critical error: calling COLO code without "
2797                          "COLO enabled", __func__);
2798         }
2799         migrate_start_colo_process(s);
2800         /*
2801          * Fixme: we will run the VM in COLO no matter what its old
2802          * running state was. After exiting COLO, we will keep it running.
2803          */
2804         s->vm_was_running = true;
2805         /* Fallthrough */
2806     case MIGRATION_STATUS_FAILED:
2807     case MIGRATION_STATUS_CANCELLED:
2808         if (s->vm_was_running) {
2809             vm_start();
2810         } else {
2811             if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
2812                 runstate_set(RUN_STATE_POSTMIGRATE);
2813             }
2814         }
2815         break;
2816 
2817     default:
2818         /* Should not reach here, but if so, forgive the VM. */
2819         error_report("%s: Unknown ending state %d", __func__, s->state);
2820         break;
2821     }
2822     qemu_bh_schedule(s->cleanup_bh);
2823     qemu_mutex_unlock_iothread();
2824 }
2825 
2826 /*
2827  * Master migration thread on the source VM.
2828  * It drives the migration and pumps the data down the outgoing channel.
2829  */
2830 static void *migration_thread(void *opaque)
2831 {
2832     MigrationState *s = opaque;
2833     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
2834     MigThrError thr_error;
2835 
2836     rcu_register_thread();
2837 
2838     s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2839 
2840     qemu_savevm_state_header(s->to_dst_file);
2841 
2842     /*
2843      * If we opened the return path, we need to make sure dst has it
2844      * opened as well.
2845      */
2846     if (s->rp_state.from_dst_file) {
2847         /* Now tell the dest that it should open its end so it can reply */
2848         qemu_savevm_send_open_return_path(s->to_dst_file);
2849 
2850         /* And do a ping that will make stuff easier to debug */
2851         qemu_savevm_send_ping(s->to_dst_file, 1);
2852     }
2853 
2854     if (migrate_postcopy()) {
2855         /*
2856          * Tell the destination that we *might* want to do postcopy later;
2857          * if the other end can't do postcopy it should fail now, nice and
2858          * early.
2859          */
2860         qemu_savevm_send_postcopy_advise(s->to_dst_file);
2861     }
2862 
2863     qemu_savevm_state_setup(s->to_dst_file);
2864 
2865     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
2866     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2867                       MIGRATION_STATUS_ACTIVE);
2868 
2869     trace_migration_thread_setup_complete();
2870 
2871     while (s->state == MIGRATION_STATUS_ACTIVE ||
2872            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2873         int64_t current_time;
2874 
2875         if (!qemu_file_rate_limit(s->to_dst_file)) {
2876             MigIterateState iter_state = migration_iteration_run(s);
2877             if (iter_state == MIG_ITERATE_SKIP) {
2878                 continue;
2879             } else if (iter_state == MIG_ITERATE_BREAK) {
2880                 break;
2881             }
2882         }
2883 
2884         /*
2885          * Try to detect any kind of failures, and see whether we
2886          * should stop the migration now.
2887          */
2888         thr_error = migration_detect_error(s);
2889         if (thr_error == MIG_THR_ERR_FATAL) {
2890             /* Stop migration */
2891             break;
2892         } else if (thr_error == MIG_THR_ERR_RECOVERED) {
2893             /*
2894              * Just recovered from an error (e.g. a network failure);
2895              * reset the local variables. This is important to avoid
2896              * breaking the transferred_bytes and bandwidth calculations.
2897              */
2898             s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2899             s->iteration_initial_bytes = 0;
2900         }
2901 
2902         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2903 
2904         migration_update_counters(s, current_time);
2905 
2906         if (qemu_file_rate_limit(s->to_dst_file)) {
2907             /* usleep expects microseconds */
2908             g_usleep((s->iteration_start_time + BUFFER_DELAY -
2909                       current_time) * 1000);
2910         }
2911     }
2912 
2913     trace_migration_thread_after_loop();
2914     migration_iteration_finish(s);
2915     rcu_unregister_thread();
2916     return NULL;
2917 }
2918 
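/*
 * Start (or resume) an outgoing migration on an already established
 * channel: set up rate limiting and the return path, then either wake
 * up a paused postcopy migration or spawn a fresh migration thread.
 */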
2919 void migrate_fd_connect(MigrationState *s, Error *error_in)
2920 {
2921     int64_t rate_limit;
2922     bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
2923 
2924     s->expected_downtime = s->parameters.downtime_limit;
2925     s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
2926     if (error_in) {
2927         migrate_fd_error(s, error_in);
2928         migrate_fd_cleanup(s);
2929         return;
2930     }
2931 
2932     if (resume) {
2933         /* This is a resumed migration */
2934         rate_limit = INT64_MAX;
2935     } else {
2936         /* This is a fresh new migration */
2937         rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
2940 
2941         /* Notify before starting migration thread */
2942         notifier_list_notify(&migration_state_notifiers, s);
2943     }
2944 
2945     qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
2946     qemu_file_set_blocking(s->to_dst_file, true);
2947 
2948     /*
2949      * Open the return path. For postcopy, it is used exclusively. For
2950      * precopy, QEMU uses the return path only if the user enabled
2951      * the "return-path" capability.
2952      */
2953     if (migrate_postcopy_ram() || migrate_use_return_path()) {
2954         if (open_return_path_on_source(s, !resume)) {
2955             error_report("Unable to open return-path for postcopy");
2956             migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
2957             migrate_fd_cleanup(s);
2958             return;
2959         }
2960     }
2961 
2962     if (resume) {
2963         /* Wake up the main migration thread to do the recovery */
2964         migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
2965                           MIGRATION_STATUS_POSTCOPY_RECOVER);
2966         qemu_sem_post(&s->postcopy_pause_sem);
2967         return;
2968     }
2969 
2970     if (multifd_save_setup() != 0) {
2971         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2972                           MIGRATION_STATUS_FAILED);
2973         migrate_fd_cleanup(s);
2974         return;
2975     }
2976     qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
2977                        QEMU_THREAD_JOINABLE);
2978     s->migration_thread_running = true;
2979 }
2980 
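/* Print the global migration properties to the monitor */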
2981 void migration_global_dump(Monitor *mon)
2982 {
2983     MigrationState *ms = migrate_get_current();
2984 
2985     monitor_printf(mon, "globals:\n");
2986     monitor_printf(mon, "store-global-state: %s\n",
2987                    ms->store_global_state ? "on" : "off");
2988     monitor_printf(mon, "only-migratable: %s\n",
2989                    ms->only_migratable ? "on" : "off");
2990     monitor_printf(mon, "send-configuration: %s\n",
2991                    ms->send_configuration ? "on" : "off");
2992     monitor_printf(mon, "send-section-footer: %s\n",
2993                    ms->send_section_footer ? "on" : "off");
2994     monitor_printf(mon, "decompress-error-check: %s\n",
2995                    ms->decompress_error_check ? "on" : "off");
2996 }
2997 
2998 #define DEFINE_PROP_MIG_CAP(name, x)             \
2999     DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false)
3000 
3001 static Property migration_properties[] = {
3002     DEFINE_PROP_BOOL("store-global-state", MigrationState,
3003                      store_global_state, true),
3004     DEFINE_PROP_BOOL("only-migratable", MigrationState, only_migratable, false),
3005     DEFINE_PROP_BOOL("send-configuration", MigrationState,
3006                      send_configuration, true),
3007     DEFINE_PROP_BOOL("send-section-footer", MigrationState,
3008                      send_section_footer, true),
3009     DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
3010                       decompress_error_check, true),
3011 
3012     /* Migration parameters */
3013     DEFINE_PROP_UINT8("x-compress-level", MigrationState,
3014                       parameters.compress_level,
3015                       DEFAULT_MIGRATE_COMPRESS_LEVEL),
3016     DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
3017                       parameters.compress_threads,
3018                       DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
3019     DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
3020                       parameters.decompress_threads,
3021                       DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
3022     DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
3023                       parameters.cpu_throttle_initial,
3024                       DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
3025     DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
3026                       parameters.cpu_throttle_increment,
3027                       DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
3028     DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
3029                       parameters.max_bandwidth, MAX_THROTTLE),
3030     DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
3031                       parameters.downtime_limit,
3032                       DEFAULT_MIGRATE_SET_DOWNTIME),
3033     DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
3034                       parameters.x_checkpoint_delay,
3035                       DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
3036     DEFINE_PROP_UINT8("x-multifd-channels", MigrationState,
3037                       parameters.x_multifd_channels,
3038                       DEFAULT_MIGRATE_MULTIFD_CHANNELS),
3039     DEFINE_PROP_UINT32("x-multifd-page-count", MigrationState,
3040                       parameters.x_multifd_page_count,
3041                       DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT),
3042     DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
3043                       parameters.xbzrle_cache_size,
3044                       DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
3045 
3046     /* Migration capabilities */
3047     DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
3048     DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
3049     DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
3050     DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS),
3051     DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
3052     DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
3053     DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
3054     DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
3055     DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
3056     DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
3057     DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
3058     DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_X_MULTIFD),
3059 
3060     DEFINE_PROP_END_OF_LIST(),
3061 };
3062 
3063 static void migration_class_init(ObjectClass *klass, void *data)
3064 {
3065     DeviceClass *dc = DEVICE_CLASS(klass);
3066 
3067     dc->user_creatable = false;
3068     dc->props = migration_properties;
3069 }
3070 
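/* QOM instance_finalize: release everything migration_instance_init() set up */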
3071 static void migration_instance_finalize(Object *obj)
3072 {
3073     MigrationState *ms = MIGRATION_OBJ(obj);
3074     MigrationParameters *params = &ms->parameters;
3075 
3076     qemu_mutex_destroy(&ms->error_mutex);
3077     qemu_mutex_destroy(&ms->qemu_file_lock);
3078     g_free(params->tls_hostname);
3079     g_free(params->tls_creds);
3080     qemu_sem_destroy(&ms->pause_sem);
3081     qemu_sem_destroy(&ms->postcopy_pause_sem);
3082     qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
3083     qemu_sem_destroy(&ms->rp_state.rp_sem);
3084     error_free(ms->error);
3085 }
3086 
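/* QOM instance_init: set defaults and initialize the locks and semaphores */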
3087 static void migration_instance_init(Object *obj)
3088 {
3089     MigrationState *ms = MIGRATION_OBJ(obj);
3090     MigrationParameters *params = &ms->parameters;
3091 
3092     ms->state = MIGRATION_STATUS_NONE;
3093     ms->mbps = -1;
3094     qemu_sem_init(&ms->pause_sem, 0);
3095     qemu_mutex_init(&ms->error_mutex);
3096 
3097     params->tls_hostname = g_strdup("");
3098     params->tls_creds = g_strdup("");
3099 
3100     /* Set has_* up only for parameter checks */
3101     params->has_compress_level = true;
3102     params->has_compress_threads = true;
3103     params->has_decompress_threads = true;
3104     params->has_cpu_throttle_initial = true;
3105     params->has_cpu_throttle_increment = true;
3106     params->has_max_bandwidth = true;
3107     params->has_downtime_limit = true;
3108     params->has_x_checkpoint_delay = true;
3109     params->has_block_incremental = true;
3110     params->has_x_multifd_channels = true;
3111     params->has_x_multifd_page_count = true;
3112     params->has_xbzrle_cache_size = true;
3113 
3114     qemu_sem_init(&ms->postcopy_pause_sem, 0);
3115     qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
3116     qemu_sem_init(&ms->rp_state.rp_sem, 0);
3117     qemu_mutex_init(&ms->qemu_file_lock);
3118 }
3119 
3120 /*
3121  * Return true if the checks pass, false otherwise. An error will be
3122  * placed in errp if provided.
3123  */
3124 static bool migration_object_check(MigrationState *ms, Error **errp)
3125 {
3126     MigrationCapabilityStatusList *head = NULL;
3127     /* Assuming all off */
3128     bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
3129     int i;
3130 
3131     if (!migrate_params_check(&ms->parameters, errp)) {
3132         return false;
3133     }
3134 
3135     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
3136         if (ms->enabled_capabilities[i]) {
3137             head = migrate_cap_add(head, i, true);
3138         }
3139     }
3140 
3141     ret = migrate_caps_check(cap_list, head, errp);
3142 
3143     /* It works with head == NULL */
3144     qapi_free_MigrationCapabilityStatusList(head);
3145 
3146     return ret;
3147 }
3148 
3149 static const TypeInfo migration_type = {
3150     .name = TYPE_MIGRATION,
3151     /*
3152      * NOTE: TYPE_MIGRATION is not really a device, as the object is
3153      * not created using qdev_create(), it is not attached to the qdev
3154      * device tree, and it is never realized.
3155      *
3156      * TODO: Make this TYPE_OBJECT once QOM provides something like
3157      * TYPE_DEVICE's "-global" properties.
3158      */
3159     .parent = TYPE_DEVICE,
3160     .class_init = migration_class_init,
3161     .class_size = sizeof(MigrationClass),
3162     .instance_size = sizeof(MigrationState),
3163     .instance_init = migration_instance_init,
3164     .instance_finalize = migration_instance_finalize,
3165 };
3166 
3167 static void register_migration_types(void)
3168 {
3169     type_register_static(&migration_type);
3170 }
3171 
3172 type_init(register_migration_types);
3173