xref: /openbmc/qemu/migration/savevm.c (revision f0984d40)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2009-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "hw/boards.h"
31 #include "net/net.h"
32 #include "migration.h"
33 #include "migration/snapshot.h"
34 #include "migration/vmstate.h"
35 #include "migration/misc.h"
36 #include "migration/register.h"
37 #include "migration/global_state.h"
38 #include "migration/channel-block.h"
39 #include "ram.h"
40 #include "qemu-file.h"
41 #include "savevm.h"
42 #include "postcopy-ram.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-commands-migration.h"
45 #include "qapi/clone-visitor.h"
46 #include "qapi/qapi-builtin-visit.h"
47 #include "qapi/qmp/qerror.h"
48 #include "qemu/error-report.h"
49 #include "sysemu/cpus.h"
50 #include "exec/memory.h"
51 #include "exec/target_page.h"
52 #include "trace.h"
53 #include "qemu/iov.h"
54 #include "qemu/job.h"
55 #include "qemu/main-loop.h"
56 #include "block/snapshot.h"
57 #include "qemu/cutils.h"
58 #include "io/channel-buffer.h"
59 #include "io/channel-file.h"
60 #include "sysemu/replay.h"
61 #include "sysemu/runstate.h"
62 #include "sysemu/sysemu.h"
63 #include "sysemu/xen.h"
64 #include "migration/colo.h"
65 #include "qemu/bitmap.h"
66 #include "net/announce.h"
67 #include "qemu/yank.h"
68 #include "yank_functions.h"
69 #include "sysemu/qtest.h"
70 
/*
 * Defined here with the value 0.
 * NOTE(review): presumably the format version carried by
 * MIG_CMD_POSTCOPY_RAM_DISCARD messages; its users are not visible in this
 * part of the file -- confirm before relying on that.
 */
const unsigned int postcopy_ram_discard_version = 0;
72 
73 /* Subcommands for QEMU_VM_COMMAND */
enum qemu_vm_cmd {
    MIG_CMD_INVALID = 0,   /* Must be 0 */
    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
    MIG_CMD_PING,              /* Request a PONG on the RP */

    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
                                      warn we might want to do PC */
    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
                                      pages as it's running. */
    MIG_CMD_POSTCOPY_RUN,          /* Start execution */

    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
                                      were previously sent during
                                      precopy but are dirty. */
    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
    MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
    MIG_CMD_MAX
};

#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX

/*
 * Per-command description, indexed by enum qemu_vm_cmd.
 *
 *   len  - fixed payload length in bytes, or -1 when the length is variable
 *   name - printable command name
 *
 * Every enumerator needs an explicit entry: a slot that is skipped by the
 * designated initializers is zero-filled, which leaves .name == NULL.
 * MIG_CMD_ENABLE_COLO used to be such a hole.  Its entry uses .len = 0, the
 * same value the zero-filled slot already had, so only the name changes.
 */
static struct mig_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name;
} mig_cmd_args[] = {
    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
    [MIG_CMD_ENABLE_COLO]      = { .len =  0, .name = "ENABLE_COLO" },
    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
};
113 
114 /* Note for MIG_CMD_POSTCOPY_ADVISE:
115  * The format of arguments is depending on postcopy mode:
116  * - postcopy RAM only
117  *   uint64_t host page size
118  *   uint64_t target page size
119  *
120  * - postcopy RAM and postcopy dirty bitmaps
121  *   format is the same as for postcopy RAM only
122  *
123  * - postcopy dirty bitmaps only
124  *   Nothing. Command length field is 0.
125  *
126  * Be careful: adding a new postcopy entity with some other parameters should
127  * not break format self-description ability. Good way is to introduce some
128  * generic extendable format with an exception for two old entities.
129  */
130 
131 /***********************************************************/
132 /* savevm/loadvm support */
133 
134 static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
135 {
136     if (is_writable) {
137         return qemu_file_new_output(QIO_CHANNEL(qio_channel_block_new(bs)));
138     } else {
139         return qemu_file_new_input(QIO_CHANNEL(qio_channel_block_new(bs)));
140     }
141 }
142 
143 
144 /* QEMUFile timer support.
145  * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
146  */
147 
148 void timer_put(QEMUFile *f, QEMUTimer *ts)
149 {
150     uint64_t expire_time;
151 
152     expire_time = timer_expire_time_ns(ts);
153     qemu_put_be64(f, expire_time);
154 }
155 
156 void timer_get(QEMUFile *f, QEMUTimer *ts)
157 {
158     uint64_t expire_time;
159 
160     expire_time = qemu_get_be64(f);
161     if (expire_time != -1) {
162         timer_mod_ns(ts, expire_time);
163     } else {
164         timer_del(ts);
165     }
166 }
167 
168 
169 /* VMState timer support.
170  * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
171  */
172 
173 static int get_timer(QEMUFile *f, void *pv, size_t size,
174                      const VMStateField *field)
175 {
176     QEMUTimer *v = pv;
177     timer_get(f, v);
178     return 0;
179 }
180 
181 static int put_timer(QEMUFile *f, void *pv, size_t size,
182                      const VMStateField *field, JSONWriter *vmdesc)
183 {
184     QEMUTimer *v = pv;
185     timer_put(f, v);
186 
187     return 0;
188 }
189 
190 const VMStateInfo vmstate_info_timer = {
191     .name = "timer",
192     .get  = get_timer,
193     .put  = put_timer,
194 };
195 
196 
/*
 * Alternative (idstr, instance_id) pair attached to a SaveStateEntry through
 * its 'compat' pointer.
 */
typedef struct CompatEntry CompatEntry;

struct CompatEntry {
    char idstr[256];
    int instance_id;
};
201 
202 typedef struct SaveStateEntry {
203     QTAILQ_ENTRY(SaveStateEntry) entry;
204     char idstr[256];
205     uint32_t instance_id;
206     int alias_id;
207     int version_id;
208     /* version id read from the stream */
209     int load_version_id;
210     int section_id;
211     /* section id read from the stream */
212     int load_section_id;
213     const SaveVMHandlers *ops;
214     const VMStateDescription *vmsd;
215     void *opaque;
216     CompatEntry *compat;
217     int is_ram;
218 } SaveStateEntry;
219 
220 typedef struct SaveState {
221     QTAILQ_HEAD(, SaveStateEntry) handlers;
222     SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
223     int global_section_id;
224     uint32_t len;
225     const char *name;
226     uint32_t target_page_bits;
227     uint32_t caps_count;
228     MigrationCapability *capabilities;
229     QemuUUID uuid;
230 } SaveState;
231 
232 static SaveState savevm_state = {
233     .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
234     .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
235     .global_section_id = 0,
236 };
237 
238 static bool should_validate_capability(int capability)
239 {
240     assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
241     /* Validate only new capabilities to keep compatibility. */
242     switch (capability) {
243     case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
244         return true;
245     default:
246         return false;
247     }
248 }
249 
250 static uint32_t get_validatable_capabilities_count(void)
251 {
252     MigrationState *s = migrate_get_current();
253     uint32_t result = 0;
254     int i;
255     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
256         if (should_validate_capability(i) && s->enabled_capabilities[i]) {
257             result++;
258         }
259     }
260     return result;
261 }
262 
263 static int configuration_pre_save(void *opaque)
264 {
265     SaveState *state = opaque;
266     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
267     MigrationState *s = migrate_get_current();
268     int i, j;
269 
270     state->len = strlen(current_name);
271     state->name = current_name;
272     state->target_page_bits = qemu_target_page_bits();
273 
274     state->caps_count = get_validatable_capabilities_count();
275     state->capabilities = g_renew(MigrationCapability, state->capabilities,
276                                   state->caps_count);
277     for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
278         if (should_validate_capability(i) && s->enabled_capabilities[i]) {
279             state->capabilities[j++] = i;
280         }
281     }
282     state->uuid = qemu_uuid;
283 
284     return 0;
285 }
286 
287 static int configuration_post_save(void *opaque)
288 {
289     SaveState *state = opaque;
290 
291     g_free(state->capabilities);
292     state->capabilities = NULL;
293     state->caps_count = 0;
294     return 0;
295 }
296 
297 static int configuration_pre_load(void *opaque)
298 {
299     SaveState *state = opaque;
300 
301     /* If there is no target-page-bits subsection it means the source
302      * predates the variable-target-page-bits support and is using the
303      * minimum possible value for this CPU.
304      */
305     state->target_page_bits = qemu_target_page_bits_min();
306     return 0;
307 }
308 
309 static bool configuration_validate_capabilities(SaveState *state)
310 {
311     bool ret = true;
312     MigrationState *s = migrate_get_current();
313     unsigned long *source_caps_bm;
314     int i;
315 
316     source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
317     for (i = 0; i < state->caps_count; i++) {
318         MigrationCapability capability = state->capabilities[i];
319         set_bit(capability, source_caps_bm);
320     }
321 
322     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
323         bool source_state, target_state;
324         if (!should_validate_capability(i)) {
325             continue;
326         }
327         source_state = test_bit(i, source_caps_bm);
328         target_state = s->enabled_capabilities[i];
329         if (source_state != target_state) {
330             error_report("Capability %s is %s, but received capability is %s",
331                          MigrationCapability_str(i),
332                          target_state ? "on" : "off",
333                          source_state ? "on" : "off");
334             ret = false;
335             /* Don't break here to report all failed capabilities */
336         }
337     }
338 
339     g_free(source_caps_bm);
340     return ret;
341 }
342 
343 static int configuration_post_load(void *opaque, int version_id)
344 {
345     SaveState *state = opaque;
346     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
347     int ret = 0;
348 
349     if (strncmp(state->name, current_name, state->len) != 0) {
350         error_report("Machine type received is '%.*s' and local is '%s'",
351                      (int) state->len, state->name, current_name);
352         ret = -EINVAL;
353         goto out;
354     }
355 
356     if (state->target_page_bits != qemu_target_page_bits()) {
357         error_report("Received TARGET_PAGE_BITS is %d but local is %d",
358                      state->target_page_bits, qemu_target_page_bits());
359         ret = -EINVAL;
360         goto out;
361     }
362 
363     if (!configuration_validate_capabilities(state)) {
364         ret = -EINVAL;
365         goto out;
366     }
367 
368 out:
369     g_free((void *)state->name);
370     state->name = NULL;
371     state->len = 0;
372     g_free(state->capabilities);
373     state->capabilities = NULL;
374     state->caps_count = 0;
375 
376     return ret;
377 }
378 
379 static int get_capability(QEMUFile *f, void *pv, size_t size,
380                           const VMStateField *field)
381 {
382     MigrationCapability *capability = pv;
383     char capability_str[UINT8_MAX + 1];
384     uint8_t len;
385     int i;
386 
387     len = qemu_get_byte(f);
388     qemu_get_buffer(f, (uint8_t *)capability_str, len);
389     capability_str[len] = '\0';
390     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
391         if (!strcmp(MigrationCapability_str(i), capability_str)) {
392             *capability = i;
393             return 0;
394         }
395     }
396     error_report("Received unknown capability %s", capability_str);
397     return -EINVAL;
398 }
399 
400 static int put_capability(QEMUFile *f, void *pv, size_t size,
401                           const VMStateField *field, JSONWriter *vmdesc)
402 {
403     MigrationCapability *capability = pv;
404     const char *capability_str = MigrationCapability_str(*capability);
405     size_t len = strlen(capability_str);
406     assert(len <= UINT8_MAX);
407 
408     qemu_put_byte(f, len);
409     qemu_put_buffer(f, (uint8_t *)capability_str, len);
410     return 0;
411 }
412 
413 static const VMStateInfo vmstate_info_capability = {
414     .name = "capability",
415     .get  = get_capability,
416     .put  = put_capability,
417 };
418 
419 /* The target-page-bits subsection is present only if the
420  * target page size is not the same as the default (ie the
421  * minimum page size for a variable-page-size guest CPU).
422  * If it is present then it contains the actual target page
423  * bits for the machine, and migration will fail if the
424  * two ends don't agree about it.
425  */
/*
 * .needed hook: send the subsection only when the current target page bits
 * exceed the minimum value.  @opaque is unused.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    if (qemu_target_page_bits() > qemu_target_page_bits_min()) {
        return true;
    }
    return false;
}
431 
432 static const VMStateDescription vmstate_target_page_bits = {
433     .name = "configuration/target-page-bits",
434     .version_id = 1,
435     .minimum_version_id = 1,
436     .needed = vmstate_target_page_bits_needed,
437     .fields = (VMStateField[]) {
438         VMSTATE_UINT32(target_page_bits, SaveState),
439         VMSTATE_END_OF_LIST()
440     }
441 };
442 
/*
 * .needed hook: send the capabilities subsection only when at least one
 * validatable capability is enabled.  @opaque is unused.
 */
static bool vmstate_capabilites_needed(void *opaque)
{
    uint32_t enabled = get_validatable_capabilities_count();

    return enabled > 0;
}
447 
448 static const VMStateDescription vmstate_capabilites = {
449     .name = "configuration/capabilities",
450     .version_id = 1,
451     .minimum_version_id = 1,
452     .needed = vmstate_capabilites_needed,
453     .fields = (VMStateField[]) {
454         VMSTATE_UINT32_V(caps_count, SaveState, 1),
455         VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
456                                     vmstate_info_capability,
457                                     MigrationCapability),
458         VMSTATE_END_OF_LIST()
459     }
460 };
461 
462 static bool vmstate_uuid_needed(void *opaque)
463 {
464     return qemu_uuid_set && migrate_validate_uuid();
465 }
466 
467 static int vmstate_uuid_post_load(void *opaque, int version_id)
468 {
469     SaveState *state = opaque;
470     char uuid_src[UUID_FMT_LEN + 1];
471     char uuid_dst[UUID_FMT_LEN + 1];
472 
473     if (!qemu_uuid_set) {
474         /*
475          * It's warning because user might not know UUID in some cases,
476          * e.g. load an old snapshot
477          */
478         qemu_uuid_unparse(&state->uuid, uuid_src);
479         warn_report("UUID is received %s, but local uuid isn't set",
480                      uuid_src);
481         return 0;
482     }
483     if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
484         qemu_uuid_unparse(&state->uuid, uuid_src);
485         qemu_uuid_unparse(&qemu_uuid, uuid_dst);
486         error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
487         return -EINVAL;
488     }
489     return 0;
490 }
491 
492 static const VMStateDescription vmstate_uuid = {
493     .name = "configuration/uuid",
494     .version_id = 1,
495     .minimum_version_id = 1,
496     .needed = vmstate_uuid_needed,
497     .post_load = vmstate_uuid_post_load,
498     .fields = (VMStateField[]) {
499         VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
500         VMSTATE_END_OF_LIST()
501     }
502 };
503 
504 static const VMStateDescription vmstate_configuration = {
505     .name = "configuration",
506     .version_id = 1,
507     .pre_load = configuration_pre_load,
508     .post_load = configuration_post_load,
509     .pre_save = configuration_pre_save,
510     .post_save = configuration_post_save,
511     .fields = (VMStateField[]) {
512         VMSTATE_UINT32(len, SaveState),
513         VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
514         VMSTATE_END_OF_LIST()
515     },
516     .subsections = (const VMStateDescription *[]) {
517         &vmstate_target_page_bits,
518         &vmstate_capabilites,
519         &vmstate_uuid,
520         NULL
521     }
522 };
523 
524 static void dump_vmstate_vmsd(FILE *out_file,
525                               const VMStateDescription *vmsd, int indent,
526                               bool is_subsection);
527 
528 static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
529                               int indent)
530 {
531     fprintf(out_file, "%*s{\n", indent, "");
532     indent += 2;
533     fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
534     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
535             field->version_id);
536     fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
537             field->field_exists ? "true" : "false");
538     fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
539     if (field->vmsd != NULL) {
540         fprintf(out_file, ",\n");
541         dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
542     }
543     fprintf(out_file, "\n%*s}", indent - 2, "");
544 }
545 
546 static void dump_vmstate_vmss(FILE *out_file,
547                               const VMStateDescription **subsection,
548                               int indent)
549 {
550     if (*subsection != NULL) {
551         dump_vmstate_vmsd(out_file, *subsection, indent, true);
552     }
553 }
554 
555 static void dump_vmstate_vmsd(FILE *out_file,
556                               const VMStateDescription *vmsd, int indent,
557                               bool is_subsection)
558 {
559     if (is_subsection) {
560         fprintf(out_file, "%*s{\n", indent, "");
561     } else {
562         fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
563     }
564     indent += 2;
565     fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
566     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
567             vmsd->version_id);
568     fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
569             vmsd->minimum_version_id);
570     if (vmsd->fields != NULL) {
571         const VMStateField *field = vmsd->fields;
572         bool first;
573 
574         fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
575         first = true;
576         while (field->name != NULL) {
577             if (field->flags & VMS_MUST_EXIST) {
578                 /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
579                 field++;
580                 continue;
581             }
582             if (!first) {
583                 fprintf(out_file, ",\n");
584             }
585             dump_vmstate_vmsf(out_file, field, indent + 2);
586             field++;
587             first = false;
588         }
589         assert(field->flags == VMS_END);
590         fprintf(out_file, "\n%*s]", indent, "");
591     }
592     if (vmsd->subsections != NULL) {
593         const VMStateDescription **subsection = vmsd->subsections;
594         bool first;
595 
596         fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
597         first = true;
598         while (*subsection != NULL) {
599             if (!first) {
600                 fprintf(out_file, ",\n");
601             }
602             dump_vmstate_vmss(out_file, subsection, indent + 2);
603             subsection++;
604             first = false;
605         }
606         fprintf(out_file, "\n%*s]", indent, "");
607     }
608     fprintf(out_file, "\n%*s}", indent - 2, "");
609 }
610 
611 static void dump_machine_type(FILE *out_file)
612 {
613     MachineClass *mc;
614 
615     mc = MACHINE_GET_CLASS(current_machine);
616 
617     fprintf(out_file, "  \"vmschkmachine\": {\n");
618     fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
619     fprintf(out_file, "  },\n");
620 }
621 
622 void dump_vmstate_json_to_file(FILE *out_file)
623 {
624     GSList *list, *elt;
625     bool first;
626 
627     fprintf(out_file, "{\n");
628     dump_machine_type(out_file);
629 
630     first = true;
631     list = object_class_get_list(TYPE_DEVICE, true);
632     for (elt = list; elt; elt = elt->next) {
633         DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
634                                              TYPE_DEVICE);
635         const char *name;
636         int indent = 2;
637 
638         if (!dc->vmsd) {
639             continue;
640         }
641 
642         if (!first) {
643             fprintf(out_file, ",\n");
644         }
645         name = object_class_get_name(OBJECT_CLASS(dc));
646         fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
647         indent += 2;
648         fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
649         fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
650                 dc->vmsd->version_id);
651         fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
652                 dc->vmsd->minimum_version_id);
653 
654         dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
655 
656         fprintf(out_file, "\n%*s}", indent - 2, "");
657         first = false;
658     }
659     fprintf(out_file, "\n}\n");
660     fclose(out_file);
661     g_slist_free(list);
662 }
663 
664 static uint32_t calculate_new_instance_id(const char *idstr)
665 {
666     SaveStateEntry *se;
667     uint32_t instance_id = 0;
668 
669     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
670         if (strcmp(idstr, se->idstr) == 0
671             && instance_id <= se->instance_id) {
672             instance_id = se->instance_id + 1;
673         }
674     }
675     /* Make sure we never loop over without being noticed */
676     assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
677     return instance_id;
678 }
679 
680 static int calculate_compat_instance_id(const char *idstr)
681 {
682     SaveStateEntry *se;
683     int instance_id = 0;
684 
685     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
686         if (!se->compat) {
687             continue;
688         }
689 
690         if (strcmp(idstr, se->compat->idstr) == 0
691             && instance_id <= se->compat->instance_id) {
692             instance_id = se->compat->instance_id + 1;
693         }
694     }
695     return instance_id;
696 }
697 
698 static inline MigrationPriority save_state_priority(SaveStateEntry *se)
699 {
700     if (se->vmsd) {
701         return se->vmsd->priority;
702     }
703     return MIG_PRI_DEFAULT;
704 }
705 
706 static void savevm_state_handler_insert(SaveStateEntry *nse)
707 {
708     MigrationPriority priority = save_state_priority(nse);
709     SaveStateEntry *se;
710     int i;
711 
712     assert(priority <= MIG_PRI_MAX);
713 
714     for (i = priority - 1; i >= 0; i--) {
715         se = savevm_state.handler_pri_head[i];
716         if (se != NULL) {
717             assert(save_state_priority(se) < priority);
718             break;
719         }
720     }
721 
722     if (i >= 0) {
723         QTAILQ_INSERT_BEFORE(se, nse, entry);
724     } else {
725         QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
726     }
727 
728     if (savevm_state.handler_pri_head[priority] == NULL) {
729         savevm_state.handler_pri_head[priority] = nse;
730     }
731 }
732 
733 static void savevm_state_handler_remove(SaveStateEntry *se)
734 {
735     SaveStateEntry *next;
736     MigrationPriority priority = save_state_priority(se);
737 
738     if (se == savevm_state.handler_pri_head[priority]) {
739         next = QTAILQ_NEXT(se, entry);
740         if (next != NULL && save_state_priority(next) == priority) {
741             savevm_state.handler_pri_head[priority] = next;
742         } else {
743             savevm_state.handler_pri_head[priority] = NULL;
744         }
745     }
746     QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
747 }
748 
749 /* TODO: Individual devices generally have very little idea about the rest
750    of the system, so instance_id should be removed/replaced.
751    Meanwhile pass -1 as instance_id if you do not already have a clearly
752    distinguishing id for all instances of your device class. */
753 int register_savevm_live(const char *idstr,
754                          uint32_t instance_id,
755                          int version_id,
756                          const SaveVMHandlers *ops,
757                          void *opaque)
758 {
759     SaveStateEntry *se;
760 
761     se = g_new0(SaveStateEntry, 1);
762     se->version_id = version_id;
763     se->section_id = savevm_state.global_section_id++;
764     se->ops = ops;
765     se->opaque = opaque;
766     se->vmsd = NULL;
767     /* if this is a live_savem then set is_ram */
768     if (ops->save_setup != NULL) {
769         se->is_ram = 1;
770     }
771 
772     pstrcat(se->idstr, sizeof(se->idstr), idstr);
773 
774     if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
775         se->instance_id = calculate_new_instance_id(se->idstr);
776     } else {
777         se->instance_id = instance_id;
778     }
779     assert(!se->compat || se->instance_id == 0);
780     savevm_state_handler_insert(se);
781     return 0;
782 }
783 
784 void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
785 {
786     SaveStateEntry *se, *new_se;
787     char id[256] = "";
788 
789     if (obj) {
790         char *oid = vmstate_if_get_id(obj);
791         if (oid) {
792             pstrcpy(id, sizeof(id), oid);
793             pstrcat(id, sizeof(id), "/");
794             g_free(oid);
795         }
796     }
797     pstrcat(id, sizeof(id), idstr);
798 
799     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
800         if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
801             savevm_state_handler_remove(se);
802             g_free(se->compat);
803             g_free(se);
804         }
805     }
806 }
807 
808 /*
809  * Perform some basic checks on vmsd's at registration
810  * time.
811  */
812 static void vmstate_check(const VMStateDescription *vmsd)
813 {
814     const VMStateField *field = vmsd->fields;
815     const VMStateDescription **subsection = vmsd->subsections;
816 
817     if (field) {
818         while (field->name) {
819             if (field->flags & (VMS_STRUCT | VMS_VSTRUCT)) {
820                 /* Recurse to sub structures */
821                 vmstate_check(field->vmsd);
822             }
823             /* Carry on */
824             field++;
825         }
826         /* Check for the end of field list canary */
827         if (field->flags != VMS_END) {
828             error_report("VMSTATE not ending with VMS_END: %s", vmsd->name);
829             g_assert_not_reached();
830         }
831     }
832 
833     while (subsection && *subsection) {
834         /*
835          * The name of a subsection should start with the name of the
836          * current object.
837          */
838         assert(!strncmp(vmsd->name, (*subsection)->name, strlen(vmsd->name)));
839         vmstate_check(*subsection);
840         subsection++;
841     }
842 }
843 
844 int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
845                                    const VMStateDescription *vmsd,
846                                    void *opaque, int alias_id,
847                                    int required_for_version,
848                                    Error **errp)
849 {
850     SaveStateEntry *se;
851 
852     /* If this triggers, alias support can be dropped for the vmsd. */
853     assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
854 
855     se = g_new0(SaveStateEntry, 1);
856     se->version_id = vmsd->version_id;
857     se->section_id = savevm_state.global_section_id++;
858     se->opaque = opaque;
859     se->vmsd = vmsd;
860     se->alias_id = alias_id;
861 
862     if (obj) {
863         char *id = vmstate_if_get_id(obj);
864         if (id) {
865             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
866                 sizeof(se->idstr)) {
867                 error_setg(errp, "Path too long for VMState (%s)", id);
868                 g_free(id);
869                 g_free(se);
870 
871                 return -1;
872             }
873             g_free(id);
874 
875             se->compat = g_new0(CompatEntry, 1);
876             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
877             se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
878                          calculate_compat_instance_id(vmsd->name) : instance_id;
879             instance_id = VMSTATE_INSTANCE_ID_ANY;
880         }
881     }
882     pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
883 
884     if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
885         se->instance_id = calculate_new_instance_id(se->idstr);
886     } else {
887         se->instance_id = instance_id;
888     }
889 
890     /* Perform a recursive sanity check during the test runs */
891     if (qtest_enabled()) {
892         vmstate_check(vmsd);
893     }
894     assert(!se->compat || se->instance_id == 0);
895     savevm_state_handler_insert(se);
896     return 0;
897 }
898 
899 void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
900                         void *opaque)
901 {
902     SaveStateEntry *se, *new_se;
903 
904     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
905         if (se->vmsd == vmsd && se->opaque == opaque) {
906             savevm_state_handler_remove(se);
907             g_free(se->compat);
908             g_free(se);
909         }
910     }
911 }
912 
913 static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
914 {
915     trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
916     if (!se->vmsd) {         /* Old style */
917         return se->ops->load_state(f, se->opaque, se->load_version_id);
918     }
919     return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
920 }
921 
922 static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
923                                    JSONWriter *vmdesc)
924 {
925     int64_t old_offset, size;
926 
927     old_offset = qemu_file_total_transferred_fast(f);
928     se->ops->save_state(f, se->opaque);
929     size = qemu_file_total_transferred_fast(f) - old_offset;
930 
931     if (vmdesc) {
932         json_writer_int64(vmdesc, "size", size);
933         json_writer_start_array(vmdesc, "fields");
934         json_writer_start_object(vmdesc, NULL);
935         json_writer_str(vmdesc, "name", "data");
936         json_writer_int64(vmdesc, "size", size);
937         json_writer_str(vmdesc, "type", "buffer");
938         json_writer_end_object(vmdesc);
939         json_writer_end_array(vmdesc);
940     }
941 }
942 
943 /*
944  * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
945  */
946 static void save_section_header(QEMUFile *f, SaveStateEntry *se,
947                                 uint8_t section_type)
948 {
949     qemu_put_byte(f, section_type);
950     qemu_put_be32(f, se->section_id);
951 
952     if (section_type == QEMU_VM_SECTION_FULL ||
953         section_type == QEMU_VM_SECTION_START) {
954         /* ID string */
955         size_t len = strlen(se->idstr);
956         qemu_put_byte(f, len);
957         qemu_put_buffer(f, (uint8_t *)se->idstr, len);
958 
959         qemu_put_be32(f, se->instance_id);
960         qemu_put_be32(f, se->version_id);
961     }
962 }
963 
964 /*
965  * Write a footer onto device sections that catches cases misformatted device
966  * sections.
967  */
968 static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
969 {
970     if (migrate_get_current()->send_section_footer) {
971         qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
972         qemu_put_be32(f, se->section_id);
973     }
974 }
975 
976 static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc)
977 {
978     int ret;
979 
980     if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
981         return 0;
982     }
983     if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
984         trace_savevm_section_skip(se->idstr, se->section_id);
985         return 0;
986     }
987 
988     trace_savevm_section_start(se->idstr, se->section_id);
989     save_section_header(f, se, QEMU_VM_SECTION_FULL);
990     if (vmdesc) {
991         json_writer_start_object(vmdesc, NULL);
992         json_writer_str(vmdesc, "name", se->idstr);
993         json_writer_int64(vmdesc, "instance_id", se->instance_id);
994     }
995 
996     trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
997     if (!se->vmsd) {
998         vmstate_save_old_style(f, se, vmdesc);
999     } else {
1000         ret = vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
1001         if (ret) {
1002             return ret;
1003         }
1004     }
1005 
1006     trace_savevm_section_end(se->idstr, se->section_id, 0);
1007     save_section_footer(f, se);
1008     if (vmdesc) {
1009         json_writer_end_object(vmdesc);
1010     }
1011     return 0;
1012 }
1013 /**
1014  * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
1015  *                           command and associated data.
1016  *
1017  * @f: File to send command on
1018  * @command: Command type to send
1019  * @len: Length of associated data
1020  * @data: Data associated with command.
1021  */
1022 static void qemu_savevm_command_send(QEMUFile *f,
1023                                      enum qemu_vm_cmd command,
1024                                      uint16_t len,
1025                                      uint8_t *data)
1026 {
1027     trace_savevm_command_send(command, len);
1028     qemu_put_byte(f, QEMU_VM_COMMAND);
1029     qemu_put_be16(f, (uint16_t)command);
1030     qemu_put_be16(f, len);
1031     qemu_put_buffer(f, data, len);
1032     qemu_fflush(f);
1033 }
1034 
1035 void qemu_savevm_send_colo_enable(QEMUFile *f)
1036 {
1037     trace_savevm_send_colo_enable();
1038     qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
1039 }
1040 
1041 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
1042 {
1043     uint32_t buf;
1044 
1045     trace_savevm_send_ping(value);
1046     buf = cpu_to_be32(value);
1047     qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
1048 }
1049 
1050 void qemu_savevm_send_open_return_path(QEMUFile *f)
1051 {
1052     trace_savevm_send_open_return_path();
1053     qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
1054 }
1055 
1056 /* We have a buffer of data to send; we don't want that all to be loaded
1057  * by the command itself, so the command contains just the length of the
1058  * extra buffer that we then send straight after it.
1059  * TODO: Must be a better way to organise that
1060  *
1061  * Returns:
1062  *    0 on success
1063  *    -ve on error
1064  */
1065 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
1066 {
1067     uint32_t tmp;
1068 
1069     if (len > MAX_VM_CMD_PACKAGED_SIZE) {
1070         error_report("%s: Unreasonably large packaged state: %zu",
1071                      __func__, len);
1072         return -1;
1073     }
1074 
1075     tmp = cpu_to_be32(len);
1076 
1077     trace_qemu_savevm_send_packaged();
1078     qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
1079 
1080     qemu_put_buffer(f, buf, len);
1081 
1082     return 0;
1083 }
1084 
1085 /* Send prior to any postcopy transfer */
1086 void qemu_savevm_send_postcopy_advise(QEMUFile *f)
1087 {
1088     if (migrate_postcopy_ram()) {
1089         uint64_t tmp[2];
1090         tmp[0] = cpu_to_be64(ram_pagesize_summary());
1091         tmp[1] = cpu_to_be64(qemu_target_page_size());
1092 
1093         trace_qemu_savevm_send_postcopy_advise();
1094         qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
1095                                  16, (uint8_t *)tmp);
1096     } else {
1097         qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
1098     }
1099 }
1100 
1101 /* Sent prior to starting the destination running in postcopy, discard pages
1102  * that have already been sent but redirtied on the source.
1103  * CMD_POSTCOPY_RAM_DISCARD consist of:
1104  *      byte   version (0)
1105  *      byte   Length of name field (not including 0)
1106  *  n x byte   RAM block name
1107  *      byte   0 terminator (just for safety)
1108  *  n x        Byte ranges within the named RAMBlock
1109  *      be64   Start of the range
1110  *      be64   Length
1111  *
1112  *  name:  RAMBlock name that these entries are part of
1113  *  len: Number of page entries
1114  *  start_list: 'len' addresses
1115  *  length_list: 'len' addresses
1116  *
1117  */
1118 void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
1119                                            uint16_t len,
1120                                            uint64_t *start_list,
1121                                            uint64_t *length_list)
1122 {
1123     uint8_t *buf;
1124     uint16_t tmplen;
1125     uint16_t t;
1126     size_t name_len = strlen(name);
1127 
1128     trace_qemu_savevm_send_postcopy_ram_discard(name, len);
1129     assert(name_len < 256);
1130     buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
1131     buf[0] = postcopy_ram_discard_version;
1132     buf[1] = name_len;
1133     memcpy(buf + 2, name, name_len);
1134     tmplen = 2 + name_len;
1135     buf[tmplen++] = '\0';
1136 
1137     for (t = 0; t < len; t++) {
1138         stq_be_p(buf + tmplen, start_list[t]);
1139         tmplen += 8;
1140         stq_be_p(buf + tmplen, length_list[t]);
1141         tmplen += 8;
1142     }
1143     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
1144     g_free(buf);
1145 }
1146 
1147 /* Get the destination into a state where it can receive postcopy data. */
1148 void qemu_savevm_send_postcopy_listen(QEMUFile *f)
1149 {
1150     trace_savevm_send_postcopy_listen();
1151     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
1152 }
1153 
1154 /* Kick the destination into running */
1155 void qemu_savevm_send_postcopy_run(QEMUFile *f)
1156 {
1157     trace_savevm_send_postcopy_run();
1158     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
1159 }
1160 
1161 void qemu_savevm_send_postcopy_resume(QEMUFile *f)
1162 {
1163     trace_savevm_send_postcopy_resume();
1164     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
1165 }
1166 
1167 void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
1168 {
1169     size_t len;
1170     char buf[256];
1171 
1172     trace_savevm_send_recv_bitmap(block_name);
1173 
1174     buf[0] = len = strlen(block_name);
1175     memcpy(buf + 1, block_name, len);
1176 
1177     qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
1178 }
1179 
1180 bool qemu_savevm_state_blocked(Error **errp)
1181 {
1182     SaveStateEntry *se;
1183 
1184     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1185         if (se->vmsd && se->vmsd->unmigratable) {
1186             error_setg(errp, "State blocked by non-migratable device '%s'",
1187                        se->idstr);
1188             return true;
1189         }
1190     }
1191     return false;
1192 }
1193 
1194 void qemu_savevm_non_migratable_list(strList **reasons)
1195 {
1196     SaveStateEntry *se;
1197 
1198     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1199         if (se->vmsd && se->vmsd->unmigratable) {
1200             QAPI_LIST_PREPEND(*reasons,
1201                               g_strdup_printf("non-migratable device: %s",
1202                                               se->idstr));
1203         }
1204     }
1205 }
1206 
1207 void qemu_savevm_state_header(QEMUFile *f)
1208 {
1209     trace_savevm_state_header();
1210     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1211     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1212 
1213     if (migrate_get_current()->send_configuration) {
1214         qemu_put_byte(f, QEMU_VM_CONFIGURATION);
1215         vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
1216     }
1217 }
1218 
1219 bool qemu_savevm_state_guest_unplug_pending(void)
1220 {
1221     SaveStateEntry *se;
1222 
1223     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1224         if (se->vmsd && se->vmsd->dev_unplug_pending &&
1225             se->vmsd->dev_unplug_pending(se->opaque)) {
1226             return true;
1227         }
1228     }
1229 
1230     return false;
1231 }
1232 
1233 void qemu_savevm_state_setup(QEMUFile *f)
1234 {
1235     MigrationState *ms = migrate_get_current();
1236     SaveStateEntry *se;
1237     Error *local_err = NULL;
1238     int ret;
1239 
1240     ms->vmdesc = json_writer_new(false);
1241     json_writer_start_object(ms->vmdesc, NULL);
1242     json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
1243     json_writer_start_array(ms->vmdesc, "devices");
1244 
1245     trace_savevm_state_setup();
1246     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1247         if (se->vmsd && se->vmsd->early_setup) {
1248             ret = vmstate_save(f, se, ms->vmdesc);
1249             if (ret) {
1250                 qemu_file_set_error(f, ret);
1251                 break;
1252             }
1253             continue;
1254         }
1255 
1256         if (!se->ops || !se->ops->save_setup) {
1257             continue;
1258         }
1259         if (se->ops->is_active) {
1260             if (!se->ops->is_active(se->opaque)) {
1261                 continue;
1262             }
1263         }
1264         save_section_header(f, se, QEMU_VM_SECTION_START);
1265 
1266         ret = se->ops->save_setup(f, se->opaque);
1267         save_section_footer(f, se);
1268         if (ret < 0) {
1269             qemu_file_set_error(f, ret);
1270             break;
1271         }
1272     }
1273 
1274     if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
1275         error_report_err(local_err);
1276     }
1277 }
1278 
1279 int qemu_savevm_state_resume_prepare(MigrationState *s)
1280 {
1281     SaveStateEntry *se;
1282     int ret;
1283 
1284     trace_savevm_state_resume_prepare();
1285 
1286     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1287         if (!se->ops || !se->ops->resume_prepare) {
1288             continue;
1289         }
1290         if (se->ops->is_active) {
1291             if (!se->ops->is_active(se->opaque)) {
1292                 continue;
1293             }
1294         }
1295         ret = se->ops->resume_prepare(s, se->opaque);
1296         if (ret < 0) {
1297             return ret;
1298         }
1299     }
1300 
1301     return 0;
1302 }
1303 
1304 /*
1305  * this function has three return values:
1306  *   negative: there was one error, and we have -errno.
1307  *   0 : We haven't finished, caller have to go again
1308  *   1 : We have finished, we can go to complete phase
1309  */
1310 int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1311 {
1312     SaveStateEntry *se;
1313     int ret = 1;
1314 
1315     trace_savevm_state_iterate();
1316     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1317         if (!se->ops || !se->ops->save_live_iterate) {
1318             continue;
1319         }
1320         if (se->ops->is_active &&
1321             !se->ops->is_active(se->opaque)) {
1322             continue;
1323         }
1324         if (se->ops->is_active_iterate &&
1325             !se->ops->is_active_iterate(se->opaque)) {
1326             continue;
1327         }
1328         /*
1329          * In the postcopy phase, any device that doesn't know how to
1330          * do postcopy should have saved it's state in the _complete
1331          * call that's already run, it might get confused if we call
1332          * iterate afterwards.
1333          */
1334         if (postcopy &&
1335             !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1336             continue;
1337         }
1338         if (qemu_file_rate_limit(f)) {
1339             return 0;
1340         }
1341         trace_savevm_section_start(se->idstr, se->section_id);
1342 
1343         save_section_header(f, se, QEMU_VM_SECTION_PART);
1344 
1345         ret = se->ops->save_live_iterate(f, se->opaque);
1346         trace_savevm_section_end(se->idstr, se->section_id, ret);
1347         save_section_footer(f, se);
1348 
1349         if (ret < 0) {
1350             error_report("failed to save SaveStateEntry with id(name): "
1351                          "%d(%s): %d",
1352                          se->section_id, se->idstr, ret);
1353             qemu_file_set_error(f, ret);
1354         }
1355         if (ret <= 0) {
1356             /* Do not proceed to the next vmstate before this one reported
1357                completion of the current stage. This serializes the migration
1358                and reduces the probability that a faster changing state is
1359                synchronized over and over again. */
1360             break;
1361         }
1362     }
1363     return ret;
1364 }
1365 
1366 static bool should_send_vmdesc(void)
1367 {
1368     MachineState *machine = MACHINE(qdev_get_machine());
1369     bool in_postcopy = migration_in_postcopy();
1370     return !machine->suppress_vmdesc && !in_postcopy;
1371 }
1372 
1373 /*
1374  * Calls the save_live_complete_postcopy methods
1375  * causing the last few pages to be sent immediately and doing any associated
1376  * cleanup.
1377  * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1378  * all the other devices, but that happens at the point we switch to postcopy.
1379  */
1380 void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1381 {
1382     SaveStateEntry *se;
1383     int ret;
1384 
1385     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1386         if (!se->ops || !se->ops->save_live_complete_postcopy) {
1387             continue;
1388         }
1389         if (se->ops->is_active) {
1390             if (!se->ops->is_active(se->opaque)) {
1391                 continue;
1392             }
1393         }
1394         trace_savevm_section_start(se->idstr, se->section_id);
1395         /* Section type */
1396         qemu_put_byte(f, QEMU_VM_SECTION_END);
1397         qemu_put_be32(f, se->section_id);
1398 
1399         ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1400         trace_savevm_section_end(se->idstr, se->section_id, ret);
1401         save_section_footer(f, se);
1402         if (ret < 0) {
1403             qemu_file_set_error(f, ret);
1404             return;
1405         }
1406     }
1407 
1408     qemu_put_byte(f, QEMU_VM_EOF);
1409     qemu_fflush(f);
1410 }
1411 
1412 static
1413 int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
1414 {
1415     SaveStateEntry *se;
1416     int ret;
1417 
1418     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1419         if (!se->ops ||
1420             (in_postcopy && se->ops->has_postcopy &&
1421              se->ops->has_postcopy(se->opaque)) ||
1422             !se->ops->save_live_complete_precopy) {
1423             continue;
1424         }
1425 
1426         if (se->ops->is_active) {
1427             if (!se->ops->is_active(se->opaque)) {
1428                 continue;
1429             }
1430         }
1431         trace_savevm_section_start(se->idstr, se->section_id);
1432 
1433         save_section_header(f, se, QEMU_VM_SECTION_END);
1434 
1435         ret = se->ops->save_live_complete_precopy(f, se->opaque);
1436         trace_savevm_section_end(se->idstr, se->section_id, ret);
1437         save_section_footer(f, se);
1438         if (ret < 0) {
1439             qemu_file_set_error(f, ret);
1440             return -1;
1441         }
1442     }
1443 
1444     return 0;
1445 }
1446 
1447 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
1448                                                     bool in_postcopy,
1449                                                     bool inactivate_disks)
1450 {
1451     MigrationState *ms = migrate_get_current();
1452     JSONWriter *vmdesc = ms->vmdesc;
1453     int vmdesc_len;
1454     SaveStateEntry *se;
1455     int ret;
1456 
1457     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1458         if (se->vmsd && se->vmsd->early_setup) {
1459             /* Already saved during qemu_savevm_state_setup(). */
1460             continue;
1461         }
1462 
1463         ret = vmstate_save(f, se, vmdesc);
1464         if (ret) {
1465             qemu_file_set_error(f, ret);
1466             return ret;
1467         }
1468     }
1469 
1470     if (inactivate_disks) {
1471         /* Inactivate before sending QEMU_VM_EOF so that the
1472          * bdrv_activate_all() on the other end won't fail. */
1473         ret = bdrv_inactivate_all();
1474         if (ret) {
1475             error_report("%s: bdrv_inactivate_all() failed (%d)",
1476                          __func__, ret);
1477             qemu_file_set_error(f, ret);
1478             return ret;
1479         }
1480     }
1481     if (!in_postcopy) {
1482         /* Postcopy stream will still be going */
1483         qemu_put_byte(f, QEMU_VM_EOF);
1484     }
1485 
1486     json_writer_end_array(vmdesc);
1487     json_writer_end_object(vmdesc);
1488     vmdesc_len = strlen(json_writer_get(vmdesc));
1489 
1490     if (should_send_vmdesc()) {
1491         qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1492         qemu_put_be32(f, vmdesc_len);
1493         qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
1494     }
1495 
1496     /* Free it now to detect any inconsistencies. */
1497     json_writer_free(vmdesc);
1498     ms->vmdesc = NULL;
1499 
1500     return 0;
1501 }
1502 
1503 int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1504                                        bool inactivate_disks)
1505 {
1506     int ret;
1507     Error *local_err = NULL;
1508     bool in_postcopy = migration_in_postcopy();
1509 
1510     if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
1511         error_report_err(local_err);
1512     }
1513 
1514     trace_savevm_state_complete_precopy();
1515 
1516     cpu_synchronize_all_states();
1517 
1518     if (!in_postcopy || iterable_only) {
1519         ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
1520         if (ret) {
1521             return ret;
1522         }
1523     }
1524 
1525     if (iterable_only) {
1526         goto flush;
1527     }
1528 
1529     ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
1530                                                           inactivate_disks);
1531     if (ret) {
1532         return ret;
1533     }
1534 
1535 flush:
1536     qemu_fflush(f);
1537     return 0;
1538 }
1539 
1540 /* Give an estimate of the amount left to be transferred,
1541  * the result is split into the amount for units that can and
1542  * for units that can't do postcopy.
1543  */
1544 void qemu_savevm_state_pending_estimate(uint64_t *must_precopy,
1545                                         uint64_t *can_postcopy)
1546 {
1547     SaveStateEntry *se;
1548 
1549     *must_precopy = 0;
1550     *can_postcopy = 0;
1551 
1552     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1553         if (!se->ops || !se->ops->state_pending_estimate) {
1554             continue;
1555         }
1556         if (se->ops->is_active) {
1557             if (!se->ops->is_active(se->opaque)) {
1558                 continue;
1559             }
1560         }
1561         se->ops->state_pending_estimate(se->opaque, must_precopy, can_postcopy);
1562     }
1563 }
1564 
1565 void qemu_savevm_state_pending_exact(uint64_t *must_precopy,
1566                                      uint64_t *can_postcopy)
1567 {
1568     SaveStateEntry *se;
1569 
1570     *must_precopy = 0;
1571     *can_postcopy = 0;
1572 
1573     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1574         if (!se->ops || !se->ops->state_pending_exact) {
1575             continue;
1576         }
1577         if (se->ops->is_active) {
1578             if (!se->ops->is_active(se->opaque)) {
1579                 continue;
1580             }
1581         }
1582         se->ops->state_pending_exact(se->opaque, must_precopy, can_postcopy);
1583     }
1584 }
1585 
1586 void qemu_savevm_state_cleanup(void)
1587 {
1588     SaveStateEntry *se;
1589     Error *local_err = NULL;
1590 
1591     if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
1592         error_report_err(local_err);
1593     }
1594 
1595     trace_savevm_state_cleanup();
1596     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1597         if (se->ops && se->ops->save_cleanup) {
1598             se->ops->save_cleanup(se->opaque);
1599         }
1600     }
1601 }
1602 
1603 static int qemu_savevm_state(QEMUFile *f, Error **errp)
1604 {
1605     int ret;
1606     MigrationState *ms = migrate_get_current();
1607     MigrationStatus status;
1608 
1609     if (migration_is_running(ms->state)) {
1610         error_setg(errp, QERR_MIGRATION_ACTIVE);
1611         return -EINVAL;
1612     }
1613 
1614     if (migrate_use_block()) {
1615         error_setg(errp, "Block migration and snapshots are incompatible");
1616         return -EINVAL;
1617     }
1618 
1619     migrate_init(ms);
1620     memset(&ram_counters, 0, sizeof(ram_counters));
1621     memset(&compression_counters, 0, sizeof(compression_counters));
1622     ms->to_dst_file = f;
1623 
1624     qemu_mutex_unlock_iothread();
1625     qemu_savevm_state_header(f);
1626     qemu_savevm_state_setup(f);
1627     qemu_mutex_lock_iothread();
1628 
1629     while (qemu_file_get_error(f) == 0) {
1630         if (qemu_savevm_state_iterate(f, false) > 0) {
1631             break;
1632         }
1633     }
1634 
1635     ret = qemu_file_get_error(f);
1636     if (ret == 0) {
1637         qemu_savevm_state_complete_precopy(f, false, false);
1638         ret = qemu_file_get_error(f);
1639     }
1640     qemu_savevm_state_cleanup();
1641     if (ret != 0) {
1642         error_setg_errno(errp, -ret, "Error while writing VM state");
1643     }
1644 
1645     if (ret != 0) {
1646         status = MIGRATION_STATUS_FAILED;
1647     } else {
1648         status = MIGRATION_STATUS_COMPLETED;
1649     }
1650     migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1651 
1652     /* f is outer parameter, it should not stay in global migration state after
1653      * this function finished */
1654     ms->to_dst_file = NULL;
1655 
1656     return ret;
1657 }
1658 
1659 void qemu_savevm_live_state(QEMUFile *f)
1660 {
1661     /* save QEMU_VM_SECTION_END section */
1662     qemu_savevm_state_complete_precopy(f, true, false);
1663     qemu_put_byte(f, QEMU_VM_EOF);
1664 }
1665 
1666 int qemu_save_device_state(QEMUFile *f)
1667 {
1668     SaveStateEntry *se;
1669 
1670     if (!migration_in_colo_state()) {
1671         qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1672         qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1673     }
1674     cpu_synchronize_all_states();
1675 
1676     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1677         int ret;
1678 
1679         if (se->is_ram) {
1680             continue;
1681         }
1682         ret = vmstate_save(f, se, NULL);
1683         if (ret) {
1684             return ret;
1685         }
1686     }
1687 
1688     qemu_put_byte(f, QEMU_VM_EOF);
1689 
1690     return qemu_file_get_error(f);
1691 }
1692 
1693 static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
1694 {
1695     SaveStateEntry *se;
1696 
1697     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1698         if (!strcmp(se->idstr, idstr) &&
1699             (instance_id == se->instance_id ||
1700              instance_id == se->alias_id))
1701             return se;
1702         /* Migrating from an older version? */
1703         if (strstr(se->idstr, idstr) && se->compat) {
1704             if (!strcmp(se->compat->idstr, idstr) &&
1705                 (instance_id == se->compat->instance_id ||
1706                  instance_id == se->alias_id))
1707                 return se;
1708         }
1709     }
1710     return NULL;
1711 }
1712 
/* Positive return codes used by the loadvm command handlers */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT = 1,
};
1717 
1718 /* ------ incoming postcopy messages ------ */
1719 /* 'advise' arrives before any transfers just to tell us that a postcopy
1720  * *might* happen - it might be skipped if precopy transferred everything
1721  * quickly.
1722  */
1723 static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1724                                          uint16_t len)
1725 {
1726     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1727     uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1728     size_t page_size = qemu_target_page_size();
1729     Error *local_err = NULL;
1730 
1731     trace_loadvm_postcopy_handle_advise();
1732     if (ps != POSTCOPY_INCOMING_NONE) {
1733         error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1734         return -1;
1735     }
1736 
1737     switch (len) {
1738     case 0:
1739         if (migrate_postcopy_ram()) {
1740             error_report("RAM postcopy is enabled but have 0 byte advise");
1741             return -EINVAL;
1742         }
1743         return 0;
1744     case 8 + 8:
1745         if (!migrate_postcopy_ram()) {
1746             error_report("RAM postcopy is disabled but have 16 byte advise");
1747             return -EINVAL;
1748         }
1749         break;
1750     default:
1751         error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1752         return -EINVAL;
1753     }
1754 
1755     if (!postcopy_ram_supported_by_host(mis)) {
1756         postcopy_state_set(POSTCOPY_INCOMING_NONE);
1757         return -1;
1758     }
1759 
1760     remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1761     local_pagesize_summary = ram_pagesize_summary();
1762 
1763     if (remote_pagesize_summary != local_pagesize_summary)  {
1764         /*
1765          * This detects two potential causes of mismatch:
1766          *   a) A mismatch in host page sizes
1767          *      Some combinations of mismatch are probably possible but it gets
1768          *      a bit more complicated.  In particular we need to place whole
1769          *      host pages on the dest at once, and we need to ensure that we
1770          *      handle dirtying to make sure we never end up sending part of
1771          *      a hostpage on it's own.
1772          *   b) The use of different huge page sizes on source/destination
1773          *      a more fine grain test is performed during RAM block migration
1774          *      but this test here causes a nice early clear failure, and
1775          *      also fails when passed to an older qemu that doesn't
1776          *      do huge pages.
1777          */
1778         error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1779                                                              " d=%" PRIx64 ")",
1780                      remote_pagesize_summary, local_pagesize_summary);
1781         return -1;
1782     }
1783 
1784     remote_tps = qemu_get_be64(mis->from_src_file);
1785     if (remote_tps != page_size) {
1786         /*
1787          * Again, some differences could be dealt with, but for now keep it
1788          * simple.
1789          */
1790         error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1791                      (int)remote_tps, page_size);
1792         return -1;
1793     }
1794 
1795     if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1796         error_report_err(local_err);
1797         return -1;
1798     }
1799 
1800     if (ram_postcopy_incoming_init(mis)) {
1801         return -1;
1802     }
1803 
1804     return 0;
1805 }
1806 
1807 /* After postcopy we will be told to throw some pages away since they're
1808  * dirty and will have to be demand fetched.  Must happen before CPU is
1809  * started.
1810  * There can be 0..many of these messages, each encoding multiple pages.
1811  */
1812 static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1813                                               uint16_t len)
1814 {
1815     int tmp;
1816     char ramid[256];
1817     PostcopyState ps = postcopy_state_get();
1818 
1819     trace_loadvm_postcopy_ram_handle_discard();
1820 
1821     switch (ps) {
1822     case POSTCOPY_INCOMING_ADVISE:
1823         /* 1st discard */
1824         tmp = postcopy_ram_prepare_discard(mis);
1825         if (tmp) {
1826             return tmp;
1827         }
1828         break;
1829 
1830     case POSTCOPY_INCOMING_DISCARD:
1831         /* Expected state */
1832         break;
1833 
1834     default:
1835         error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1836                      ps);
1837         return -1;
1838     }
1839     /* We're expecting a
1840      *    Version (0)
1841      *    a RAM ID string (length byte, name, 0 term)
1842      *    then at least 1 16 byte chunk
1843     */
1844     if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1845         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1846         return -1;
1847     }
1848 
1849     tmp = qemu_get_byte(mis->from_src_file);
1850     if (tmp != postcopy_ram_discard_version) {
1851         error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1852         return -1;
1853     }
1854 
1855     if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1856         error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1857         return -1;
1858     }
1859     tmp = qemu_get_byte(mis->from_src_file);
1860     if (tmp != 0) {
1861         error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1862         return -1;
1863     }
1864 
1865     len -= 3 + strlen(ramid);
1866     if (len % 16) {
1867         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1868         return -1;
1869     }
1870     trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1871     while (len) {
1872         uint64_t start_addr, block_length;
1873         start_addr = qemu_get_be64(mis->from_src_file);
1874         block_length = qemu_get_be64(mis->from_src_file);
1875 
1876         len -= 16;
1877         int ret = ram_discard_range(ramid, start_addr, block_length);
1878         if (ret) {
1879             return ret;
1880         }
1881     }
1882     trace_loadvm_postcopy_ram_handle_discard_end();
1883 
1884     return 0;
1885 }
1886 
1887 /*
1888  * Triggered by a postcopy_listen command; this thread takes over reading
1889  * the input stream, leaving the main thread free to carry on loading the rest
1890  * of the device state (from RAM).
1891  * (TODO:This could do with being in a postcopy file - but there again it's
1892  * just another input loop, not that postcopy specific)
1893  */
1894 static void *postcopy_ram_listen_thread(void *opaque)
1895 {
1896     MigrationIncomingState *mis = migration_incoming_get_current();
1897     QEMUFile *f = mis->from_src_file;
1898     int load_res;
1899     MigrationState *migr = migrate_get_current();
1900 
1901     object_ref(OBJECT(migr));
1902 
1903     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1904                                    MIGRATION_STATUS_POSTCOPY_ACTIVE);
1905     qemu_sem_post(&mis->thread_sync_sem);
1906     trace_postcopy_ram_listen_thread_start();
1907 
1908     rcu_register_thread();
1909     /*
1910      * Because we're a thread and not a coroutine we can't yield
1911      * in qemu_file, and thus we must be blocking now.
1912      */
1913     qemu_file_set_blocking(f, true);
1914     load_res = qemu_loadvm_state_main(f, mis);
1915 
1916     /*
1917      * This is tricky, but, mis->from_src_file can change after it
1918      * returns, when postcopy recovery happened. In the future, we may
1919      * want a wrapper for the QEMUFile handle.
1920      */
1921     f = mis->from_src_file;
1922 
1923     /* And non-blocking again so we don't block in any cleanup */
1924     qemu_file_set_blocking(f, false);
1925 
1926     trace_postcopy_ram_listen_thread_exit();
1927     if (load_res < 0) {
1928         qemu_file_set_error(f, load_res);
1929         dirty_bitmap_mig_cancel_incoming();
1930         if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
1931             !migrate_postcopy_ram() && migrate_dirty_bitmaps())
1932         {
1933             error_report("%s: loadvm failed during postcopy: %d. All states "
1934                          "are migrated except dirty bitmaps. Some dirty "
1935                          "bitmaps may be lost, and present migrated dirty "
1936                          "bitmaps are correctly migrated and valid.",
1937                          __func__, load_res);
1938             load_res = 0; /* prevent further exit() */
1939         } else {
1940             error_report("%s: loadvm failed: %d", __func__, load_res);
1941             migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1942                                            MIGRATION_STATUS_FAILED);
1943         }
1944     }
1945     if (load_res >= 0) {
1946         /*
1947          * This looks good, but it's possible that the device loading in the
1948          * main thread hasn't finished yet, and so we might not be in 'RUN'
1949          * state yet; wait for the end of the main thread.
1950          */
1951         qemu_event_wait(&mis->main_thread_load_event);
1952     }
1953     postcopy_ram_incoming_cleanup(mis);
1954 
1955     if (load_res < 0) {
1956         /*
1957          * If something went wrong then we have a bad state so exit;
1958          * depending how far we got it might be possible at this point
1959          * to leave the guest running and fire MCEs for pages that never
1960          * arrived as a desperate recovery step.
1961          */
1962         rcu_unregister_thread();
1963         exit(EXIT_FAILURE);
1964     }
1965 
1966     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1967                                    MIGRATION_STATUS_COMPLETED);
1968     /*
1969      * If everything has worked fine, then the main thread has waited
1970      * for us to start, and we're the last use of the mis.
1971      * (If something broke then qemu will have to exit anyway since it's
1972      * got a bad migration state).
1973      */
1974     migration_incoming_state_destroy();
1975     qemu_loadvm_state_cleanup();
1976 
1977     rcu_unregister_thread();
1978     mis->have_listen_thread = false;
1979     postcopy_state_set(POSTCOPY_INCOMING_END);
1980 
1981     object_unref(OBJECT(migr));
1982 
1983     return NULL;
1984 }
1985 
1986 /* After this message we must be able to immediately receive postcopy data */
1987 static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1988 {
1989     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1990     Error *local_err = NULL;
1991 
1992     trace_loadvm_postcopy_handle_listen("enter");
1993 
1994     if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1995         error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1996         return -1;
1997     }
1998     if (ps == POSTCOPY_INCOMING_ADVISE) {
1999         /*
2000          * A rare case, we entered listen without having to do any discards,
2001          * so do the setup that's normally done at the time of the 1st discard.
2002          */
2003         if (migrate_postcopy_ram()) {
2004             postcopy_ram_prepare_discard(mis);
2005         }
2006     }
2007 
2008     trace_loadvm_postcopy_handle_listen("after discard");
2009 
2010     /*
2011      * Sensitise RAM - can now generate requests for blocks that don't exist
2012      * However, at this point the CPU shouldn't be running, and the IO
2013      * shouldn't be doing anything yet so don't actually expect requests
2014      */
2015     if (migrate_postcopy_ram()) {
2016         if (postcopy_ram_incoming_setup(mis)) {
2017             postcopy_ram_incoming_cleanup(mis);
2018             return -1;
2019         }
2020     }
2021 
2022     trace_loadvm_postcopy_handle_listen("after uffd");
2023 
2024     if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
2025         error_report_err(local_err);
2026         return -1;
2027     }
2028 
2029     mis->have_listen_thread = true;
2030     postcopy_thread_create(mis, &mis->listen_thread, "postcopy/listen",
2031                            postcopy_ram_listen_thread, QEMU_THREAD_DETACHED);
2032     trace_loadvm_postcopy_handle_listen("return");
2033 
2034     return 0;
2035 }
2036 
2037 static void loadvm_postcopy_handle_run_bh(void *opaque)
2038 {
2039     Error *local_err = NULL;
2040     MigrationIncomingState *mis = opaque;
2041 
2042     trace_loadvm_postcopy_handle_run_bh("enter");
2043 
2044     /* TODO we should move all of this lot into postcopy_ram.c or a shared code
2045      * in migration.c
2046      */
2047     cpu_synchronize_all_post_init();
2048 
2049     trace_loadvm_postcopy_handle_run_bh("after cpu sync");
2050 
2051     qemu_announce_self(&mis->announce_timer, migrate_announce_params());
2052 
2053     trace_loadvm_postcopy_handle_run_bh("after announce");
2054 
2055     /* Make sure all file formats throw away their mutable metadata.
2056      * If we get an error here, just don't restart the VM yet. */
2057     bdrv_activate_all(&local_err);
2058     if (local_err) {
2059         error_report_err(local_err);
2060         local_err = NULL;
2061         autostart = false;
2062     }
2063 
2064     trace_loadvm_postcopy_handle_run_bh("after invalidate cache");
2065 
2066     dirty_bitmap_mig_before_vm_start();
2067 
2068     if (autostart) {
2069         /* Hold onto your hats, starting the CPU */
2070         vm_start();
2071     } else {
2072         /* leave it paused and let management decide when to start the CPU */
2073         runstate_set(RUN_STATE_PAUSED);
2074     }
2075 
2076     qemu_bh_delete(mis->bh);
2077 
2078     trace_loadvm_postcopy_handle_run_bh("return");
2079 }
2080 
2081 /* After all discards we can start running and asking for pages */
2082 static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
2083 {
2084     PostcopyState ps = postcopy_state_get();
2085 
2086     trace_loadvm_postcopy_handle_run();
2087     if (ps != POSTCOPY_INCOMING_LISTENING) {
2088         error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
2089         return -1;
2090     }
2091 
2092     postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2093     mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
2094     qemu_bh_schedule(mis->bh);
2095 
2096     /* We need to finish reading the stream from the package
2097      * and also stop reading anything more from the stream that loaded the
2098      * package (since it's now being read by the listener thread).
2099      * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2100      */
2101     return LOADVM_QUIT;
2102 }
2103 
2104 /* We must be with page_request_mutex held */
2105 static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
2106                                        gpointer data)
2107 {
2108     MigrationIncomingState *mis = data;
2109     void *host_addr = (void *) key;
2110     ram_addr_t rb_offset;
2111     RAMBlock *rb;
2112     int ret;
2113 
2114     rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
2115     if (!rb) {
2116         /*
2117          * This should _never_ happen.  However be nice for a migrating VM to
2118          * not crash/assert.  Post an error (note: intended to not use *_once
2119          * because we do want to see all the illegal addresses; and this can
2120          * never be triggered by the guest so we're safe) and move on next.
2121          */
2122         error_report("%s: illegal host addr %p", __func__, host_addr);
2123         /* Try the next entry */
2124         return FALSE;
2125     }
2126 
2127     ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
2128     if (ret) {
2129         /* Please refer to above comment. */
2130         error_report("%s: send rp message failed for addr %p",
2131                      __func__, host_addr);
2132         return FALSE;
2133     }
2134 
2135     trace_postcopy_page_req_sync(host_addr);
2136 
2137     return FALSE;
2138 }
2139 
2140 static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
2141 {
2142     WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
2143         g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
2144     }
2145 }
2146 
2147 static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
2148 {
2149     if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
2150         error_report("%s: illegal resume received", __func__);
2151         /* Don't fail the load, only for this. */
2152         return 0;
2153     }
2154 
2155     /*
2156      * Reset the last_rb before we resend any page req to source again, since
2157      * the source should have it reset already.
2158      */
2159     mis->last_rb = NULL;
2160 
2161     /*
2162      * This means source VM is ready to resume the postcopy migration.
2163      */
2164     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2165                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
2166 
2167     trace_loadvm_postcopy_handle_resume();
2168 
2169     /* Tell source that "we are ready" */
2170     migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
2171 
2172     /*
2173      * After a postcopy recovery, the source should have lost the postcopy
2174      * queue, or potentially the requested pages could have been lost during
2175      * the network down phase.  Let's re-sync with the source VM by re-sending
2176      * all the pending pages that we eagerly need, so these threads won't get
2177      * blocked too long due to the recovery.
2178      *
2179      * Without this procedure, the faulted destination VM threads (waiting for
2180      * page requests right before the postcopy is interrupted) can keep hanging
2181      * until the pages are sent by the source during the background copying of
2182      * pages, or another thread faulted on the same address accidentally.
2183      */
2184     migrate_send_rp_req_pages_pending(mis);
2185 
2186     /*
2187      * It's time to switch state and release the fault thread to continue
2188      * service page faults.  Note that this should be explicitly after the
2189      * above call to migrate_send_rp_req_pages_pending().  In short:
2190      * migrate_send_rp_message_req_pages() is not thread safe, yet.
2191      */
2192     qemu_sem_post(&mis->postcopy_pause_sem_fault);
2193 
2194     if (migrate_postcopy_preempt()) {
2195         /*
2196          * The preempt channel will be created in async manner, now let's
2197          * wait for it and make sure it's created.
2198          */
2199         qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
2200         assert(mis->postcopy_qemufile_dst);
2201         /* Kick the fast ram load thread too */
2202         qemu_sem_post(&mis->postcopy_pause_sem_fast_load);
2203     }
2204 
2205     return 0;
2206 }
2207 
2208 /**
2209  * Immediately following this command is a blob of data containing an embedded
2210  * chunk of migration stream; read it and load it.
2211  *
2212  * @mis: Incoming state
2213  * @length: Length of packaged data to read
2214  *
2215  * Returns: Negative values on error
2216  *
2217  */
2218 static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2219 {
2220     int ret;
2221     size_t length;
2222     QIOChannelBuffer *bioc;
2223 
2224     length = qemu_get_be32(mis->from_src_file);
2225     trace_loadvm_handle_cmd_packaged(length);
2226 
2227     if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2228         error_report("Unreasonably large packaged state: %zu", length);
2229         return -1;
2230     }
2231 
2232     bioc = qio_channel_buffer_new(length);
2233     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2234     ret = qemu_get_buffer(mis->from_src_file,
2235                           bioc->data,
2236                           length);
2237     if (ret != length) {
2238         object_unref(OBJECT(bioc));
2239         error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2240                      ret, length);
2241         return (ret < 0) ? ret : -EAGAIN;
2242     }
2243     bioc->usage += length;
2244     trace_loadvm_handle_cmd_packaged_received(ret);
2245 
2246     QEMUFile *packf = qemu_file_new_input(QIO_CHANNEL(bioc));
2247 
2248     ret = qemu_loadvm_state_main(packf, mis);
2249     trace_loadvm_handle_cmd_packaged_main(ret);
2250     qemu_fclose(packf);
2251     object_unref(OBJECT(bioc));
2252 
2253     return ret;
2254 }
2255 
2256 /*
2257  * Handle request that source requests for recved_bitmap on
2258  * destination. Payload format:
2259  *
2260  * len (1 byte) + ramblock_name (<255 bytes)
2261  */
2262 static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2263                                      uint16_t len)
2264 {
2265     QEMUFile *file = mis->from_src_file;
2266     RAMBlock *rb;
2267     char block_name[256];
2268     size_t cnt;
2269 
2270     cnt = qemu_get_counted_string(file, block_name);
2271     if (!cnt) {
2272         error_report("%s: failed to read block name", __func__);
2273         return -EINVAL;
2274     }
2275 
2276     /* Validate before using the data */
2277     if (qemu_file_get_error(file)) {
2278         return qemu_file_get_error(file);
2279     }
2280 
2281     if (len != cnt + 1) {
2282         error_report("%s: invalid payload length (%d)", __func__, len);
2283         return -EINVAL;
2284     }
2285 
2286     rb = qemu_ram_block_by_name(block_name);
2287     if (!rb) {
2288         error_report("%s: block '%s' not found", __func__, block_name);
2289         return -EINVAL;
2290     }
2291 
2292     migrate_send_rp_recv_bitmap(mis, block_name);
2293 
2294     trace_loadvm_handle_recv_bitmap(block_name);
2295 
2296     return 0;
2297 }
2298 
2299 static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2300 {
2301     int ret = migration_incoming_enable_colo();
2302 
2303     if (!ret) {
2304         ret = colo_init_ram_cache();
2305         if (ret) {
2306             migration_incoming_disable_colo();
2307         }
2308     }
2309     return ret;
2310 }
2311 
2312 /*
2313  * Process an incoming 'QEMU_VM_COMMAND'
2314  * 0           just a normal return
2315  * LOADVM_QUIT All good, but exit the loop
2316  * <0          Error
2317  */
2318 static int loadvm_process_command(QEMUFile *f)
2319 {
2320     MigrationIncomingState *mis = migration_incoming_get_current();
2321     uint16_t cmd;
2322     uint16_t len;
2323     uint32_t tmp32;
2324 
2325     cmd = qemu_get_be16(f);
2326     len = qemu_get_be16(f);
2327 
2328     /* Check validity before continue processing of cmds */
2329     if (qemu_file_get_error(f)) {
2330         return qemu_file_get_error(f);
2331     }
2332 
2333     if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2334         error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2335         return -EINVAL;
2336     }
2337 
2338     trace_loadvm_process_command(mig_cmd_args[cmd].name, len);
2339 
2340     if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2341         error_report("%s received with bad length - expecting %zu, got %d",
2342                      mig_cmd_args[cmd].name,
2343                      (size_t)mig_cmd_args[cmd].len, len);
2344         return -ERANGE;
2345     }
2346 
2347     switch (cmd) {
2348     case MIG_CMD_OPEN_RETURN_PATH:
2349         if (mis->to_src_file) {
2350             error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2351             /* Not really a problem, so don't give up */
2352             return 0;
2353         }
2354         mis->to_src_file = qemu_file_get_return_path(f);
2355         if (!mis->to_src_file) {
2356             error_report("CMD_OPEN_RETURN_PATH failed");
2357             return -1;
2358         }
2359         break;
2360 
2361     case MIG_CMD_PING:
2362         tmp32 = qemu_get_be32(f);
2363         trace_loadvm_process_command_ping(tmp32);
2364         if (!mis->to_src_file) {
2365             error_report("CMD_PING (0x%x) received with no return path",
2366                          tmp32);
2367             return -1;
2368         }
2369         migrate_send_rp_pong(mis, tmp32);
2370         break;
2371 
2372     case MIG_CMD_PACKAGED:
2373         return loadvm_handle_cmd_packaged(mis);
2374 
2375     case MIG_CMD_POSTCOPY_ADVISE:
2376         return loadvm_postcopy_handle_advise(mis, len);
2377 
2378     case MIG_CMD_POSTCOPY_LISTEN:
2379         return loadvm_postcopy_handle_listen(mis);
2380 
2381     case MIG_CMD_POSTCOPY_RUN:
2382         return loadvm_postcopy_handle_run(mis);
2383 
2384     case MIG_CMD_POSTCOPY_RAM_DISCARD:
2385         return loadvm_postcopy_ram_handle_discard(mis, len);
2386 
2387     case MIG_CMD_POSTCOPY_RESUME:
2388         return loadvm_postcopy_handle_resume(mis);
2389 
2390     case MIG_CMD_RECV_BITMAP:
2391         return loadvm_handle_recv_bitmap(mis, len);
2392 
2393     case MIG_CMD_ENABLE_COLO:
2394         return loadvm_process_enable_colo(mis);
2395     }
2396 
2397     return 0;
2398 }
2399 
2400 /*
2401  * Read a footer off the wire and check that it matches the expected section
2402  *
2403  * Returns: true if the footer was good
2404  *          false if there is a problem (and calls error_report to say why)
2405  */
2406 static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2407 {
2408     int ret;
2409     uint8_t read_mark;
2410     uint32_t read_section_id;
2411 
2412     if (!migrate_get_current()->send_section_footer) {
2413         /* No footer to check */
2414         return true;
2415     }
2416 
2417     read_mark = qemu_get_byte(f);
2418 
2419     ret = qemu_file_get_error(f);
2420     if (ret) {
2421         error_report("%s: Read section footer failed: %d",
2422                      __func__, ret);
2423         return false;
2424     }
2425 
2426     if (read_mark != QEMU_VM_SECTION_FOOTER) {
2427         error_report("Missing section footer for %s", se->idstr);
2428         return false;
2429     }
2430 
2431     read_section_id = qemu_get_be32(f);
2432     if (read_section_id != se->load_section_id) {
2433         error_report("Mismatched section id in footer for %s -"
2434                      " read 0x%x expected 0x%x",
2435                      se->idstr, read_section_id, se->load_section_id);
2436         return false;
2437     }
2438 
2439     /* All good */
2440     return true;
2441 }
2442 
2443 static int
2444 qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2445 {
2446     uint32_t instance_id, version_id, section_id;
2447     SaveStateEntry *se;
2448     char idstr[256];
2449     int ret;
2450 
2451     /* Read section start */
2452     section_id = qemu_get_be32(f);
2453     if (!qemu_get_counted_string(f, idstr)) {
2454         error_report("Unable to read ID string for section %u",
2455                      section_id);
2456         return -EINVAL;
2457     }
2458     instance_id = qemu_get_be32(f);
2459     version_id = qemu_get_be32(f);
2460 
2461     ret = qemu_file_get_error(f);
2462     if (ret) {
2463         error_report("%s: Failed to read instance/version ID: %d",
2464                      __func__, ret);
2465         return ret;
2466     }
2467 
2468     trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2469             instance_id, version_id);
2470     /* Find savevm section */
2471     se = find_se(idstr, instance_id);
2472     if (se == NULL) {
2473         error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2474                      "Make sure that your current VM setup matches your "
2475                      "saved VM setup, including any hotplugged devices",
2476                      idstr, instance_id);
2477         return -EINVAL;
2478     }
2479 
2480     /* Validate version */
2481     if (version_id > se->version_id) {
2482         error_report("savevm: unsupported version %d for '%s' v%d",
2483                      version_id, idstr, se->version_id);
2484         return -EINVAL;
2485     }
2486     se->load_version_id = version_id;
2487     se->load_section_id = section_id;
2488 
2489     /* Validate if it is a device's state */
2490     if (xen_enabled() && se->is_ram) {
2491         error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2492         return -EINVAL;
2493     }
2494 
2495     ret = vmstate_load(f, se);
2496     if (ret < 0) {
2497         error_report("error while loading state for instance 0x%"PRIx32" of"
2498                      " device '%s'", instance_id, idstr);
2499         return ret;
2500     }
2501     if (!check_section_footer(f, se)) {
2502         return -EINVAL;
2503     }
2504 
2505     return 0;
2506 }
2507 
2508 static int
2509 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2510 {
2511     uint32_t section_id;
2512     SaveStateEntry *se;
2513     int ret;
2514 
2515     section_id = qemu_get_be32(f);
2516 
2517     ret = qemu_file_get_error(f);
2518     if (ret) {
2519         error_report("%s: Failed to read section ID: %d",
2520                      __func__, ret);
2521         return ret;
2522     }
2523 
2524     trace_qemu_loadvm_state_section_partend(section_id);
2525     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2526         if (se->load_section_id == section_id) {
2527             break;
2528         }
2529     }
2530     if (se == NULL) {
2531         error_report("Unknown savevm section %d", section_id);
2532         return -EINVAL;
2533     }
2534 
2535     ret = vmstate_load(f, se);
2536     if (ret < 0) {
2537         error_report("error while loading state section id %d(%s)",
2538                      section_id, se->idstr);
2539         return ret;
2540     }
2541     if (!check_section_footer(f, se)) {
2542         return -EINVAL;
2543     }
2544 
2545     return 0;
2546 }
2547 
2548 static int qemu_loadvm_state_header(QEMUFile *f)
2549 {
2550     unsigned int v;
2551     int ret;
2552 
2553     v = qemu_get_be32(f);
2554     if (v != QEMU_VM_FILE_MAGIC) {
2555         error_report("Not a migration stream");
2556         return -EINVAL;
2557     }
2558 
2559     v = qemu_get_be32(f);
2560     if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2561         error_report("SaveVM v2 format is obsolete and don't work anymore");
2562         return -ENOTSUP;
2563     }
2564     if (v != QEMU_VM_FILE_VERSION) {
2565         error_report("Unsupported migration stream version");
2566         return -ENOTSUP;
2567     }
2568 
2569     if (migrate_get_current()->send_configuration) {
2570         if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2571             error_report("Configuration section missing");
2572             qemu_loadvm_state_cleanup();
2573             return -EINVAL;
2574         }
2575         ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2576 
2577         if (ret) {
2578             qemu_loadvm_state_cleanup();
2579             return ret;
2580         }
2581     }
2582     return 0;
2583 }
2584 
2585 static int qemu_loadvm_state_setup(QEMUFile *f)
2586 {
2587     SaveStateEntry *se;
2588     int ret;
2589 
2590     trace_loadvm_state_setup();
2591     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2592         if (!se->ops || !se->ops->load_setup) {
2593             continue;
2594         }
2595         if (se->ops->is_active) {
2596             if (!se->ops->is_active(se->opaque)) {
2597                 continue;
2598             }
2599         }
2600 
2601         ret = se->ops->load_setup(f, se->opaque);
2602         if (ret < 0) {
2603             qemu_file_set_error(f, ret);
2604             error_report("Load state of device %s failed", se->idstr);
2605             return ret;
2606         }
2607     }
2608     return 0;
2609 }
2610 
2611 void qemu_loadvm_state_cleanup(void)
2612 {
2613     SaveStateEntry *se;
2614 
2615     trace_loadvm_state_cleanup();
2616     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2617         if (se->ops && se->ops->load_cleanup) {
2618             se->ops->load_cleanup(se->opaque);
2619         }
2620     }
2621 }
2622 
2623 /* Return true if we should continue the migration, or false. */
2624 static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2625 {
2626     int i;
2627 
2628     trace_postcopy_pause_incoming();
2629 
2630     assert(migrate_postcopy_ram());
2631 
2632     /*
2633      * Unregister yank with either from/to src would work, since ioc behind it
2634      * is the same
2635      */
2636     migration_ioc_unregister_yank_from_file(mis->from_src_file);
2637 
2638     assert(mis->from_src_file);
2639     qemu_file_shutdown(mis->from_src_file);
2640     qemu_fclose(mis->from_src_file);
2641     mis->from_src_file = NULL;
2642 
2643     assert(mis->to_src_file);
2644     qemu_file_shutdown(mis->to_src_file);
2645     qemu_mutex_lock(&mis->rp_mutex);
2646     qemu_fclose(mis->to_src_file);
2647     mis->to_src_file = NULL;
2648     qemu_mutex_unlock(&mis->rp_mutex);
2649 
2650     /*
2651      * NOTE: this must happen before reset the PostcopyTmpPages below,
2652      * otherwise it's racy to reset those fields when the fast load thread
2653      * can be accessing it in parallel.
2654      */
2655     if (mis->postcopy_qemufile_dst) {
2656         qemu_file_shutdown(mis->postcopy_qemufile_dst);
2657         /* Take the mutex to make sure the fast ram load thread halted */
2658         qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
2659         migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
2660         qemu_fclose(mis->postcopy_qemufile_dst);
2661         mis->postcopy_qemufile_dst = NULL;
2662         qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
2663     }
2664 
2665     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2666                       MIGRATION_STATUS_POSTCOPY_PAUSED);
2667 
2668     /* Notify the fault thread for the invalidated file handle */
2669     postcopy_fault_thread_notify(mis);
2670 
2671     /*
2672      * If network is interrupted, any temp page we received will be useless
2673      * because we didn't mark them as "received" in receivedmap.  After a
2674      * proper recovery later (which will sync src dirty bitmap with receivedmap
2675      * on dest) these cached small pages will be resent again.
2676      */
2677     for (i = 0; i < mis->postcopy_channels; i++) {
2678         postcopy_temp_page_reset(&mis->postcopy_tmp_pages[i]);
2679     }
2680 
2681     error_report("Detected IO failure for postcopy. "
2682                  "Migration paused.");
2683 
2684     while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2685         qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2686     }
2687 
2688     trace_postcopy_pause_incoming_continued();
2689 
2690     return true;
2691 }
2692 
2693 int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2694 {
2695     uint8_t section_type;
2696     int ret = 0;
2697 
2698 retry:
2699     while (true) {
2700         section_type = qemu_get_byte(f);
2701 
2702         ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL);
2703         if (ret) {
2704             break;
2705         }
2706 
2707         trace_qemu_loadvm_state_section(section_type);
2708         switch (section_type) {
2709         case QEMU_VM_SECTION_START:
2710         case QEMU_VM_SECTION_FULL:
2711             ret = qemu_loadvm_section_start_full(f, mis);
2712             if (ret < 0) {
2713                 goto out;
2714             }
2715             break;
2716         case QEMU_VM_SECTION_PART:
2717         case QEMU_VM_SECTION_END:
2718             ret = qemu_loadvm_section_part_end(f, mis);
2719             if (ret < 0) {
2720                 goto out;
2721             }
2722             break;
2723         case QEMU_VM_COMMAND:
2724             ret = loadvm_process_command(f);
2725             trace_qemu_loadvm_state_section_command(ret);
2726             if ((ret < 0) || (ret == LOADVM_QUIT)) {
2727                 goto out;
2728             }
2729             break;
2730         case QEMU_VM_EOF:
2731             /* This is the end of migration */
2732             goto out;
2733         default:
2734             error_report("Unknown savevm section type %d", section_type);
2735             ret = -EINVAL;
2736             goto out;
2737         }
2738     }
2739 
2740 out:
2741     if (ret < 0) {
2742         qemu_file_set_error(f, ret);
2743 
2744         /* Cancel bitmaps incoming regardless of recovery */
2745         dirty_bitmap_mig_cancel_incoming();
2746 
2747         /*
2748          * If we are during an active postcopy, then we pause instead
2749          * of bail out to at least keep the VM's dirty data.  Note
2750          * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
2751          * during which we're still receiving device states and we
2752          * still haven't yet started the VM on destination.
2753          *
2754          * Only RAM postcopy supports recovery. Still, if RAM postcopy is
2755          * enabled, canceled bitmaps postcopy will not affect RAM postcopy
2756          * recovering.
2757          */
2758         if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2759             migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
2760             /* Reset f to point to the newly created channel */
2761             f = mis->from_src_file;
2762             goto retry;
2763         }
2764     }
2765     return ret;
2766 }
2767 
2768 int qemu_loadvm_state(QEMUFile *f)
2769 {
2770     MigrationIncomingState *mis = migration_incoming_get_current();
2771     Error *local_err = NULL;
2772     int ret;
2773 
2774     if (qemu_savevm_state_blocked(&local_err)) {
2775         error_report_err(local_err);
2776         return -EINVAL;
2777     }
2778 
2779     ret = qemu_loadvm_state_header(f);
2780     if (ret) {
2781         return ret;
2782     }
2783 
2784     if (qemu_loadvm_state_setup(f) != 0) {
2785         return -EINVAL;
2786     }
2787 
2788     cpu_synchronize_all_pre_loadvm();
2789 
2790     ret = qemu_loadvm_state_main(f, mis);
2791     qemu_event_set(&mis->main_thread_load_event);
2792 
2793     trace_qemu_loadvm_state_post_main(ret);
2794 
2795     if (mis->have_listen_thread) {
2796         /* Listen thread still going, can't clean up yet */
2797         return ret;
2798     }
2799 
2800     if (ret == 0) {
2801         ret = qemu_file_get_error(f);
2802     }
2803 
2804     /*
2805      * Try to read in the VMDESC section as well, so that dumping tools that
2806      * intercept our migration stream have the chance to see it.
2807      */
2808 
2809     /* We've got to be careful; if we don't read the data and just shut the fd
2810      * then the sender can error if we close while it's still sending.
2811      * We also mustn't read data that isn't there; some transports (RDMA)
2812      * will stall waiting for that data when the source has already closed.
2813      */
2814     if (ret == 0 && should_send_vmdesc()) {
2815         uint8_t *buf;
2816         uint32_t size;
2817         uint8_t  section_type = qemu_get_byte(f);
2818 
2819         if (section_type != QEMU_VM_VMDESCRIPTION) {
2820             error_report("Expected vmdescription section, but got %d",
2821                          section_type);
2822             /*
2823              * It doesn't seem worth failing at this point since
2824              * we apparently have an otherwise valid VM state
2825              */
2826         } else {
2827             buf = g_malloc(0x1000);
2828             size = qemu_get_be32(f);
2829 
2830             while (size > 0) {
2831                 uint32_t read_chunk = MIN(size, 0x1000);
2832                 qemu_get_buffer(f, buf, read_chunk);
2833                 size -= read_chunk;
2834             }
2835             g_free(buf);
2836         }
2837     }
2838 
2839     qemu_loadvm_state_cleanup();
2840     cpu_synchronize_all_post_init();
2841 
2842     return ret;
2843 }
2844 
2845 int qemu_load_device_state(QEMUFile *f)
2846 {
2847     MigrationIncomingState *mis = migration_incoming_get_current();
2848     int ret;
2849 
2850     /* Load QEMU_VM_SECTION_FULL section */
2851     ret = qemu_loadvm_state_main(f, mis);
2852     if (ret < 0) {
2853         error_report("Failed to load device state: %d", ret);
2854         return ret;
2855     }
2856 
2857     cpu_synchronize_all_post_init();
2858     return 0;
2859 }
2860 
2861 bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
2862                   bool has_devices, strList *devices, Error **errp)
2863 {
2864     BlockDriverState *bs;
2865     QEMUSnapshotInfo sn1, *sn = &sn1;
2866     int ret = -1, ret2;
2867     QEMUFile *f;
2868     int saved_vm_running;
2869     uint64_t vm_state_size;
2870     g_autoptr(GDateTime) now = g_date_time_new_now_local();
2871     AioContext *aio_context;
2872 
2873     GLOBAL_STATE_CODE();
2874 
2875     if (migration_is_blocked(errp)) {
2876         return false;
2877     }
2878 
2879     if (!replay_can_snapshot()) {
2880         error_setg(errp, "Record/replay does not allow making snapshot "
2881                    "right now. Try once more later.");
2882         return false;
2883     }
2884 
2885     if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
2886         return false;
2887     }
2888 
2889     /* Delete old snapshots of the same name */
2890     if (name) {
2891         if (overwrite) {
2892             if (bdrv_all_delete_snapshot(name, has_devices,
2893                                          devices, errp) < 0) {
2894                 return false;
2895             }
2896         } else {
2897             ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
2898             if (ret2 < 0) {
2899                 return false;
2900             }
2901             if (ret2 == 1) {
2902                 error_setg(errp,
2903                            "Snapshot '%s' already exists in one or more devices",
2904                            name);
2905                 return false;
2906             }
2907         }
2908     }
2909 
2910     bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
2911     if (bs == NULL) {
2912         return false;
2913     }
2914     aio_context = bdrv_get_aio_context(bs);
2915 
2916     saved_vm_running = runstate_is_running();
2917 
2918     ret = global_state_store();
2919     if (ret) {
2920         error_setg(errp, "Error saving global state");
2921         return false;
2922     }
2923     vm_stop(RUN_STATE_SAVE_VM);
2924 
2925     bdrv_drain_all_begin();
2926 
2927     aio_context_acquire(aio_context);
2928 
2929     memset(sn, 0, sizeof(*sn));
2930 
2931     /* fill auxiliary fields */
2932     sn->date_sec = g_date_time_to_unix(now);
2933     sn->date_nsec = g_date_time_get_microsecond(now) * 1000;
2934     sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2935     if (replay_mode != REPLAY_MODE_NONE) {
2936         sn->icount = replay_get_current_icount();
2937     } else {
2938         sn->icount = -1ULL;
2939     }
2940 
2941     if (name) {
2942         pstrcpy(sn->name, sizeof(sn->name), name);
2943     } else {
2944         g_autofree char *autoname = g_date_time_format(now,  "vm-%Y%m%d%H%M%S");
2945         pstrcpy(sn->name, sizeof(sn->name), autoname);
2946     }
2947 
2948     /* save the VM state */
2949     f = qemu_fopen_bdrv(bs, 1);
2950     if (!f) {
2951         error_setg(errp, "Could not open VM state file");
2952         goto the_end;
2953     }
2954     ret = qemu_savevm_state(f, errp);
2955     vm_state_size = qemu_file_total_transferred(f);
2956     ret2 = qemu_fclose(f);
2957     if (ret < 0) {
2958         goto the_end;
2959     }
2960     if (ret2 < 0) {
2961         ret = ret2;
2962         goto the_end;
2963     }
2964 
2965     /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2966      * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2967      * it only releases the lock once.  Therefore synchronous I/O will deadlock
2968      * unless we release the AioContext before bdrv_all_create_snapshot().
2969      */
2970     aio_context_release(aio_context);
2971     aio_context = NULL;
2972 
2973     ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
2974                                    has_devices, devices, errp);
2975     if (ret < 0) {
2976         bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
2977         goto the_end;
2978     }
2979 
2980     ret = 0;
2981 
2982  the_end:
2983     if (aio_context) {
2984         aio_context_release(aio_context);
2985     }
2986 
2987     bdrv_drain_all_end();
2988 
2989     if (saved_vm_running) {
2990         vm_start();
2991     }
2992     return ret == 0;
2993 }
2994 
2995 void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2996                                 Error **errp)
2997 {
2998     QEMUFile *f;
2999     QIOChannelFile *ioc;
3000     int saved_vm_running;
3001     int ret;
3002 
3003     if (!has_live) {
3004         /* live default to true so old version of Xen tool stack can have a
3005          * successful live migration */
3006         live = true;
3007     }
3008 
3009     saved_vm_running = runstate_is_running();
3010     vm_stop(RUN_STATE_SAVE_VM);
3011     global_state_store_running();
3012 
3013     ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
3014                                     0660, errp);
3015     if (!ioc) {
3016         goto the_end;
3017     }
3018     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
3019     f = qemu_file_new_output(QIO_CHANNEL(ioc));
3020     object_unref(OBJECT(ioc));
3021     ret = qemu_save_device_state(f);
3022     if (ret < 0 || qemu_fclose(f) < 0) {
3023         error_setg(errp, QERR_IO_ERROR);
3024     } else {
3025         /* libxl calls the QMP command "stop" before calling
3026          * "xen-save-devices-state" and in case of migration failure, libxl
3027          * would call "cont".
3028          * So call bdrv_inactivate_all (release locks) here to let the other
3029          * side of the migration take control of the images.
3030          */
3031         if (live && !saved_vm_running) {
3032             ret = bdrv_inactivate_all();
3033             if (ret) {
3034                 error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
3035                            __func__, ret);
3036             }
3037         }
3038     }
3039 
3040  the_end:
3041     if (saved_vm_running) {
3042         vm_start();
3043     }
3044 }
3045 
3046 void qmp_xen_load_devices_state(const char *filename, Error **errp)
3047 {
3048     QEMUFile *f;
3049     QIOChannelFile *ioc;
3050     int ret;
3051 
3052     /* Guest must be paused before loading the device state; the RAM state
3053      * will already have been loaded by xc
3054      */
3055     if (runstate_is_running()) {
3056         error_setg(errp, "Cannot update device state while vm is running");
3057         return;
3058     }
3059     vm_stop(RUN_STATE_RESTORE_VM);
3060 
3061     ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
3062     if (!ioc) {
3063         return;
3064     }
3065     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
3066     f = qemu_file_new_input(QIO_CHANNEL(ioc));
3067     object_unref(OBJECT(ioc));
3068 
3069     ret = qemu_loadvm_state(f);
3070     qemu_fclose(f);
3071     if (ret < 0) {
3072         error_setg(errp, QERR_IO_ERROR);
3073     }
3074     migration_incoming_state_destroy();
3075 }
3076 
3077 bool load_snapshot(const char *name, const char *vmstate,
3078                    bool has_devices, strList *devices, Error **errp)
3079 {
3080     BlockDriverState *bs_vm_state;
3081     QEMUSnapshotInfo sn;
3082     QEMUFile *f;
3083     int ret;
3084     AioContext *aio_context;
3085     MigrationIncomingState *mis = migration_incoming_get_current();
3086 
3087     if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3088         return false;
3089     }
3090     ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
3091     if (ret < 0) {
3092         return false;
3093     }
3094     if (ret == 0) {
3095         error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
3096                    name);
3097         return false;
3098     }
3099 
3100     bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
3101     if (!bs_vm_state) {
3102         return false;
3103     }
3104     aio_context = bdrv_get_aio_context(bs_vm_state);
3105 
3106     /* Don't even try to load empty VM states */
3107     aio_context_acquire(aio_context);
3108     ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
3109     aio_context_release(aio_context);
3110     if (ret < 0) {
3111         return false;
3112     } else if (sn.vm_state_size == 0) {
3113         error_setg(errp, "This is a disk-only snapshot. Revert to it "
3114                    " offline using qemu-img");
3115         return false;
3116     }
3117 
3118     /*
3119      * Flush the record/replay queue. Now the VM state is going
3120      * to change. Therefore we don't need to preserve its consistency
3121      */
3122     replay_flush_events();
3123 
3124     /* Flush all IO requests so they don't interfere with the new state.  */
3125     bdrv_drain_all_begin();
3126 
3127     ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
3128     if (ret < 0) {
3129         goto err_drain;
3130     }
3131 
3132     /* restore the VM state */
3133     f = qemu_fopen_bdrv(bs_vm_state, 0);
3134     if (!f) {
3135         error_setg(errp, "Could not open VM state file");
3136         goto err_drain;
3137     }
3138 
3139     qemu_system_reset(SHUTDOWN_CAUSE_SNAPSHOT_LOAD);
3140     mis->from_src_file = f;
3141 
3142     if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
3143         ret = -EINVAL;
3144         goto err_drain;
3145     }
3146     aio_context_acquire(aio_context);
3147     ret = qemu_loadvm_state(f);
3148     migration_incoming_state_destroy();
3149     aio_context_release(aio_context);
3150 
3151     bdrv_drain_all_end();
3152 
3153     if (ret < 0) {
3154         error_setg(errp, "Error %d while loading VM state", ret);
3155         return false;
3156     }
3157 
3158     return true;
3159 
3160 err_drain:
3161     bdrv_drain_all_end();
3162     return false;
3163 }
3164 
3165 bool delete_snapshot(const char *name, bool has_devices,
3166                      strList *devices, Error **errp)
3167 {
3168     if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3169         return false;
3170     }
3171 
3172     if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
3173         return false;
3174     }
3175 
3176     return true;
3177 }
3178 
3179 void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
3180 {
3181     qemu_ram_set_idstr(mr->ram_block,
3182                        memory_region_name(mr), dev);
3183     qemu_ram_set_migratable(mr->ram_block);
3184 }
3185 
3186 void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
3187 {
3188     qemu_ram_unset_idstr(mr->ram_block);
3189     qemu_ram_unset_migratable(mr->ram_block);
3190 }
3191 
3192 void vmstate_register_ram_global(MemoryRegion *mr)
3193 {
3194     vmstate_register_ram(mr, NULL);
3195 }
3196 
3197 bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
3198 {
3199     /* check needed if --only-migratable is specified */
3200     if (!only_migratable) {
3201         return true;
3202     }
3203 
3204     return !(vmsd && vmsd->unmigratable);
3205 }
3206 
3207 typedef struct SnapshotJob {
3208     Job common;
3209     char *tag;
3210     char *vmstate;
3211     strList *devices;
3212     Coroutine *co;
3213     Error **errp;
3214     bool ret;
3215 } SnapshotJob;
3216 
3217 static void qmp_snapshot_job_free(SnapshotJob *s)
3218 {
3219     g_free(s->tag);
3220     g_free(s->vmstate);
3221     qapi_free_strList(s->devices);
3222 }
3223 
3224 
3225 static void snapshot_load_job_bh(void *opaque)
3226 {
3227     Job *job = opaque;
3228     SnapshotJob *s = container_of(job, SnapshotJob, common);
3229     int orig_vm_running;
3230 
3231     job_progress_set_remaining(&s->common, 1);
3232 
3233     orig_vm_running = runstate_is_running();
3234     vm_stop(RUN_STATE_RESTORE_VM);
3235 
3236     s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
3237     if (s->ret && orig_vm_running) {
3238         vm_start();
3239     }
3240 
3241     job_progress_update(&s->common, 1);
3242 
3243     qmp_snapshot_job_free(s);
3244     aio_co_wake(s->co);
3245 }
3246 
3247 static void snapshot_save_job_bh(void *opaque)
3248 {
3249     Job *job = opaque;
3250     SnapshotJob *s = container_of(job, SnapshotJob, common);
3251 
3252     job_progress_set_remaining(&s->common, 1);
3253     s->ret = save_snapshot(s->tag, false, s->vmstate,
3254                            true, s->devices, s->errp);
3255     job_progress_update(&s->common, 1);
3256 
3257     qmp_snapshot_job_free(s);
3258     aio_co_wake(s->co);
3259 }
3260 
3261 static void snapshot_delete_job_bh(void *opaque)
3262 {
3263     Job *job = opaque;
3264     SnapshotJob *s = container_of(job, SnapshotJob, common);
3265 
3266     job_progress_set_remaining(&s->common, 1);
3267     s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
3268     job_progress_update(&s->common, 1);
3269 
3270     qmp_snapshot_job_free(s);
3271     aio_co_wake(s->co);
3272 }
3273 
3274 static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
3275 {
3276     SnapshotJob *s = container_of(job, SnapshotJob, common);
3277     s->errp = errp;
3278     s->co = qemu_coroutine_self();
3279     aio_bh_schedule_oneshot(qemu_get_aio_context(),
3280                             snapshot_save_job_bh, job);
3281     qemu_coroutine_yield();
3282     return s->ret ? 0 : -1;
3283 }
3284 
3285 static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
3286 {
3287     SnapshotJob *s = container_of(job, SnapshotJob, common);
3288     s->errp = errp;
3289     s->co = qemu_coroutine_self();
3290     aio_bh_schedule_oneshot(qemu_get_aio_context(),
3291                             snapshot_load_job_bh, job);
3292     qemu_coroutine_yield();
3293     return s->ret ? 0 : -1;
3294 }
3295 
3296 static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
3297 {
3298     SnapshotJob *s = container_of(job, SnapshotJob, common);
3299     s->errp = errp;
3300     s->co = qemu_coroutine_self();
3301     aio_bh_schedule_oneshot(qemu_get_aio_context(),
3302                             snapshot_delete_job_bh, job);
3303     qemu_coroutine_yield();
3304     return s->ret ? 0 : -1;
3305 }
3306 
3307 
3308 static const JobDriver snapshot_load_job_driver = {
3309     .instance_size = sizeof(SnapshotJob),
3310     .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
3311     .run           = snapshot_load_job_run,
3312 };
3313 
3314 static const JobDriver snapshot_save_job_driver = {
3315     .instance_size = sizeof(SnapshotJob),
3316     .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
3317     .run           = snapshot_save_job_run,
3318 };
3319 
3320 static const JobDriver snapshot_delete_job_driver = {
3321     .instance_size = sizeof(SnapshotJob),
3322     .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
3323     .run           = snapshot_delete_job_run,
3324 };
3325 
3326 
3327 void qmp_snapshot_save(const char *job_id,
3328                        const char *tag,
3329                        const char *vmstate,
3330                        strList *devices,
3331                        Error **errp)
3332 {
3333     SnapshotJob *s;
3334 
3335     s = job_create(job_id, &snapshot_save_job_driver, NULL,
3336                    qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3337                    NULL, NULL, errp);
3338     if (!s) {
3339         return;
3340     }
3341 
3342     s->tag = g_strdup(tag);
3343     s->vmstate = g_strdup(vmstate);
3344     s->devices = QAPI_CLONE(strList, devices);
3345 
3346     job_start(&s->common);
3347 }
3348 
3349 void qmp_snapshot_load(const char *job_id,
3350                        const char *tag,
3351                        const char *vmstate,
3352                        strList *devices,
3353                        Error **errp)
3354 {
3355     SnapshotJob *s;
3356 
3357     s = job_create(job_id, &snapshot_load_job_driver, NULL,
3358                    qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3359                    NULL, NULL, errp);
3360     if (!s) {
3361         return;
3362     }
3363 
3364     s->tag = g_strdup(tag);
3365     s->vmstate = g_strdup(vmstate);
3366     s->devices = QAPI_CLONE(strList, devices);
3367 
3368     job_start(&s->common);
3369 }
3370 
3371 void qmp_snapshot_delete(const char *job_id,
3372                          const char *tag,
3373                          strList *devices,
3374                          Error **errp)
3375 {
3376     SnapshotJob *s;
3377 
3378     s = job_create(job_id, &snapshot_delete_job_driver, NULL,
3379                    qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3380                    NULL, NULL, errp);
3381     if (!s) {
3382         return;
3383     }
3384 
3385     s->tag = g_strdup(tag);
3386     s->devices = QAPI_CLONE(strList, devices);
3387 
3388     job_start(&s->common);
3389 }
3390