xref: /openbmc/qemu/migration/savevm.c (revision 2a8af382)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2009-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "hw/boards.h"
31 #include "net/net.h"
32 #include "migration.h"
33 #include "migration/snapshot.h"
34 #include "migration/vmstate.h"
35 #include "migration/misc.h"
36 #include "migration/register.h"
37 #include "migration/global_state.h"
38 #include "migration/channel-block.h"
39 #include "ram.h"
40 #include "qemu-file.h"
41 #include "savevm.h"
42 #include "postcopy-ram.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-commands-migration.h"
45 #include "qapi/qmp/json-writer.h"
46 #include "qapi/clone-visitor.h"
47 #include "qapi/qapi-builtin-visit.h"
48 #include "qapi/qmp/qerror.h"
49 #include "qemu/error-report.h"
50 #include "sysemu/cpus.h"
51 #include "exec/memory.h"
52 #include "exec/target_page.h"
53 #include "trace.h"
54 #include "qemu/iov.h"
55 #include "qemu/job.h"
56 #include "qemu/main-loop.h"
57 #include "block/snapshot.h"
58 #include "qemu/cutils.h"
59 #include "io/channel-buffer.h"
60 #include "io/channel-file.h"
61 #include "sysemu/replay.h"
62 #include "sysemu/runstate.h"
63 #include "sysemu/sysemu.h"
64 #include "sysemu/xen.h"
65 #include "migration/colo.h"
66 #include "qemu/bitmap.h"
67 #include "net/announce.h"
68 #include "qemu/yank.h"
69 #include "yank_functions.h"
70 
71 const unsigned int postcopy_ram_discard_version;
72 
73 /* Subcommands for QEMU_VM_COMMAND */
74 enum qemu_vm_cmd {
75     MIG_CMD_INVALID = 0,   /* Must be 0 */
76     MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
77     MIG_CMD_PING,              /* Request a PONG on the RP */
78 
79     MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
80                                       warn we might want to do postcopy */
81     MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
82                                       pages while the destination VM runs. */
83     MIG_CMD_POSTCOPY_RUN,          /* Start execution */
84 
85     MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
86                                       were previously sent during
87                                       precopy but are dirty. */
88     MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
89     MIG_CMD_ENABLE_COLO,       /* Enable COLO */
90     MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
91     MIG_CMD_RECV_BITMAP,       /* Request the received bitmap on dst */
92     MIG_CMD_MAX
93 };
94 
95 #define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
96 static struct mig_cmd_args {
97     ssize_t     len; /* -1 = variable */
98     const char *name;
99 } mig_cmd_args[] = {
100     [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
101     [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
102     [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
103     [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
104     [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
105     [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
106     [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
107                                    .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
108     [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
109     [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
110     [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
111     [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
112 };
113 
114 /* Note for MIG_CMD_POSTCOPY_ADVISE:
115  * The format of the arguments depends on the postcopy mode:
116  * - postcopy RAM only
117  *   uint64_t host page size
118  *   uint64_t target page size
119  *
120  * - postcopy RAM and postcopy dirty bitmaps
121  *   format is the same as for postcopy RAM only
122  *
123  * - postcopy dirty bitmaps only
124  *   Nothing. Command length field is 0.
125  *
126  * Be careful: adding a new postcopy entity with some other parameters should
127  * not break the format's self-description ability. A good way is to introduce
128  * a generic, extensible format with an exception for the two old entities.
129  */
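
/*
 * For illustration (values hypothetical), with RAM postcopy enabled the
 * ADVISE command as emitted by qemu_savevm_command_send() carries a 16 byte
 * payload, e.g. on a host using only 4K pages:
 *
 *     byte  QEMU_VM_COMMAND
 *     be16  MIG_CMD_POSTCOPY_ADVISE
 *     be16  16                        (payload length)
 *     be64  0x1000                    (host page size summary)
 *     be64  0x1000                    (target page size)
 */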
130 
131 /***********************************************************/
132 /* savevm/loadvm support */
133 
134 static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
135 {
136     if (is_writable) {
137         return qemu_file_new_output(QIO_CHANNEL(qio_channel_block_new(bs)));
138     } else {
139         return qemu_file_new_input(QIO_CHANNEL(qio_channel_block_new(bs)));
140     }
141 }
142 
143 
144 /* QEMUFile timer support.
145  * Not in qemu-file.c so as not to make qemu-timer.c a dependency of qemu-file.c
146  */
147 
148 void timer_put(QEMUFile *f, QEMUTimer *ts)
149 {
150     uint64_t expire_time;
151 
152     expire_time = timer_expire_time_ns(ts);
153     qemu_put_be64(f, expire_time);
154 }
155 
156 void timer_get(QEMUFile *f, QEMUTimer *ts)
157 {
158     uint64_t expire_time;
159 
160     expire_time = qemu_get_be64(f);
161     if (expire_time != -1) {
162         timer_mod_ns(ts, expire_time);
163     } else {
164         timer_del(ts);
165     }
166 }
167 
168 
169 /* VMState timer support.
170  * Not in vmstate.c so as not to make qemu-timer.c a dependency of vmstate.c
171  */
172 
173 static int get_timer(QEMUFile *f, void *pv, size_t size,
174                      const VMStateField *field)
175 {
176     QEMUTimer *v = pv;
177     timer_get(f, v);
178     return 0;
179 }
180 
181 static int put_timer(QEMUFile *f, void *pv, size_t size,
182                      const VMStateField *field, JSONWriter *vmdesc)
183 {
184     QEMUTimer *v = pv;
185     timer_put(f, v);
186 
187     return 0;
188 }
189 
190 const VMStateInfo vmstate_info_timer = {
191     .name = "timer",
192     .get  = get_timer,
193     .put  = put_timer,
194 };
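
/*
 * A minimal sketch of how a device pulls in vmstate_info_timer, via the
 * VMSTATE_TIMER_PTR() macro from migration/vmstate.h (the device struct and
 * field names here are hypothetical):
 *
 *     static const VMStateDescription vmstate_mydev = {
 *         .name = "mydev",
 *         .version_id = 1,
 *         .minimum_version_id = 1,
 *         .fields = (VMStateField[]) {
 *             VMSTATE_TIMER_PTR(poll_timer, MyDevState),
 *             VMSTATE_END_OF_LIST()
 *         }
 *     };
 */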
195 
196 
197 typedef struct CompatEntry {
198     char idstr[256];
199     int instance_id;
200 } CompatEntry;
201 
202 typedef struct SaveStateEntry {
203     QTAILQ_ENTRY(SaveStateEntry) entry;
204     char idstr[256];
205     uint32_t instance_id;
206     int alias_id;
207     int version_id;
208     /* version id read from the stream */
209     int load_version_id;
210     int section_id;
211     /* section id read from the stream */
212     int load_section_id;
213     const SaveVMHandlers *ops;
214     const VMStateDescription *vmsd;
215     void *opaque;
216     CompatEntry *compat;
217     int is_ram;
218 } SaveStateEntry;
219 
220 typedef struct SaveState {
221     QTAILQ_HEAD(, SaveStateEntry) handlers;
222     SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
223     int global_section_id;
224     uint32_t len;
225     const char *name;
226     uint32_t target_page_bits;
227     uint32_t caps_count;
228     MigrationCapability *capabilities;
229     QemuUUID uuid;
230 } SaveState;
231 
232 static SaveState savevm_state = {
233     .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
234     .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
235     .global_section_id = 0,
236 };
237 
238 static bool should_validate_capability(int capability)
239 {
240     assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
241     /* Validate only new capabilities to keep compatibility. */
242     switch (capability) {
243     case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
244         return true;
245     default:
246         return false;
247     }
248 }
249 
250 static uint32_t get_validatable_capabilities_count(void)
251 {
252     MigrationState *s = migrate_get_current();
253     uint32_t result = 0;
254     int i;
255     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
256         if (should_validate_capability(i) && s->enabled_capabilities[i]) {
257             result++;
258         }
259     }
260     return result;
261 }
262 
263 static int configuration_pre_save(void *opaque)
264 {
265     SaveState *state = opaque;
266     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
267     MigrationState *s = migrate_get_current();
268     int i, j;
269 
270     state->len = strlen(current_name);
271     state->name = current_name;
272     state->target_page_bits = qemu_target_page_bits();
273 
274     state->caps_count = get_validatable_capabilities_count();
275     state->capabilities = g_renew(MigrationCapability, state->capabilities,
276                                   state->caps_count);
277     for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
278         if (should_validate_capability(i) && s->enabled_capabilities[i]) {
279             state->capabilities[j++] = i;
280         }
281     }
282     state->uuid = qemu_uuid;
283 
284     return 0;
285 }
286 
287 static int configuration_post_save(void *opaque)
288 {
289     SaveState *state = opaque;
290 
291     g_free(state->capabilities);
292     state->capabilities = NULL;
293     state->caps_count = 0;
294     return 0;
295 }
296 
297 static int configuration_pre_load(void *opaque)
298 {
299     SaveState *state = opaque;
300 
301     /* If there is no target-page-bits subsection it means the source
302      * predates the variable-target-page-bits support and is using the
303      * minimum possible value for this CPU.
304      */
305     state->target_page_bits = qemu_target_page_bits_min();
306     return 0;
307 }
308 
309 static bool configuration_validate_capabilities(SaveState *state)
310 {
311     bool ret = true;
312     MigrationState *s = migrate_get_current();
313     unsigned long *source_caps_bm;
314     int i;
315 
316     source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
317     for (i = 0; i < state->caps_count; i++) {
318         MigrationCapability capability = state->capabilities[i];
319         set_bit(capability, source_caps_bm);
320     }
321 
322     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
323         bool source_state, target_state;
324         if (!should_validate_capability(i)) {
325             continue;
326         }
327         source_state = test_bit(i, source_caps_bm);
328         target_state = s->enabled_capabilities[i];
329         if (source_state != target_state) {
330             error_report("Capability %s is %s, but received capability is %s",
331                          MigrationCapability_str(i),
332                          target_state ? "on" : "off",
333                          source_state ? "on" : "off");
334             ret = false;
335             /* Don't break here to report all failed capabilities */
336         }
337     }
338 
339     g_free(source_caps_bm);
340     return ret;
341 }
342 
343 static int configuration_post_load(void *opaque, int version_id)
344 {
345     SaveState *state = opaque;
346     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
347     int ret = 0;
348 
349     if (strncmp(state->name, current_name, state->len) != 0) {
350         error_report("Machine type received is '%.*s' and local is '%s'",
351                      (int) state->len, state->name, current_name);
352         ret = -EINVAL;
353         goto out;
354     }
355 
356     if (state->target_page_bits != qemu_target_page_bits()) {
357         error_report("Received TARGET_PAGE_BITS is %d but local is %d",
358                      state->target_page_bits, qemu_target_page_bits());
359         ret = -EINVAL;
360         goto out;
361     }
362 
363     if (!configuration_validate_capabilities(state)) {
364         ret = -EINVAL;
365         goto out;
366     }
367 
368 out:
369     g_free((void *)state->name);
370     state->name = NULL;
371     state->len = 0;
372     g_free(state->capabilities);
373     state->capabilities = NULL;
374     state->caps_count = 0;
375 
376     return ret;
377 }
378 
379 static int get_capability(QEMUFile *f, void *pv, size_t size,
380                           const VMStateField *field)
381 {
382     MigrationCapability *capability = pv;
383     char capability_str[UINT8_MAX + 1];
384     uint8_t len;
385     int i;
386 
387     len = qemu_get_byte(f);
388     qemu_get_buffer(f, (uint8_t *)capability_str, len);
389     capability_str[len] = '\0';
390     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
391         if (!strcmp(MigrationCapability_str(i), capability_str)) {
392             *capability = i;
393             return 0;
394         }
395     }
396     error_report("Received unknown capability %s", capability_str);
397     return -EINVAL;
398 }
399 
400 static int put_capability(QEMUFile *f, void *pv, size_t size,
401                           const VMStateField *field, JSONWriter *vmdesc)
402 {
403     MigrationCapability *capability = pv;
404     const char *capability_str = MigrationCapability_str(*capability);
405     size_t len = strlen(capability_str);
406     assert(len <= UINT8_MAX);
407 
408     qemu_put_byte(f, len);
409     qemu_put_buffer(f, (uint8_t *)capability_str, len);
410     return 0;
411 }
412 
413 static const VMStateInfo vmstate_info_capability = {
414     .name = "capability",
415     .get  = get_capability,
416     .put  = put_capability,
417 };
418 
419 /* The target-page-bits subsection is present only if the
420  * target page size is not the same as the default (i.e. the
421  * minimum page size for a variable-page-size guest CPU).
422  * If it is present then it contains the actual target page
423  * bits for the machine, and migration will fail if the
424  * two ends don't agree about it.
425  */
426 static bool vmstate_target_page_bits_needed(void *opaque)
427 {
428     return qemu_target_page_bits()
429         > qemu_target_page_bits_min();
430 }
431 
432 static const VMStateDescription vmstate_target_page_bits = {
433     .name = "configuration/target-page-bits",
434     .version_id = 1,
435     .minimum_version_id = 1,
436     .needed = vmstate_target_page_bits_needed,
437     .fields = (VMStateField[]) {
438         VMSTATE_UINT32(target_page_bits, SaveState),
439         VMSTATE_END_OF_LIST()
440     }
441 };
442 
443 static bool vmstate_capabilites_needed(void *opaque)
444 {
445     return get_validatable_capabilities_count() > 0;
446 }
447 
448 static const VMStateDescription vmstate_capabilites = {
449     .name = "configuration/capabilities",
450     .version_id = 1,
451     .minimum_version_id = 1,
452     .needed = vmstate_capabilites_needed,
453     .fields = (VMStateField[]) {
454         VMSTATE_UINT32_V(caps_count, SaveState, 1),
455         VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
456                                     vmstate_info_capability,
457                                     MigrationCapability),
458         VMSTATE_END_OF_LIST()
459     }
460 };
461 
462 static bool vmstate_uuid_needed(void *opaque)
463 {
464     return qemu_uuid_set && migrate_validate_uuid();
465 }
466 
467 static int vmstate_uuid_post_load(void *opaque, int version_id)
468 {
469     SaveState *state = opaque;
470     char uuid_src[UUID_FMT_LEN + 1];
471     char uuid_dst[UUID_FMT_LEN + 1];
472 
473     if (!qemu_uuid_set) {
474         /*
475          * This is only a warning because the user might not know the UUID,
476          * e.g. when loading an old snapshot
477          */
478         qemu_uuid_unparse(&state->uuid, uuid_src);
479         warn_report("Received UUID %s, but local UUID isn't set",
480                      uuid_src);
481         return 0;
482     }
483     if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
484         qemu_uuid_unparse(&state->uuid, uuid_src);
485         qemu_uuid_unparse(&qemu_uuid, uuid_dst);
486         error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
487         return -EINVAL;
488     }
489     return 0;
490 }
491 
492 static const VMStateDescription vmstate_uuid = {
493     .name = "configuration/uuid",
494     .version_id = 1,
495     .minimum_version_id = 1,
496     .needed = vmstate_uuid_needed,
497     .post_load = vmstate_uuid_post_load,
498     .fields = (VMStateField[]) {
499         VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
500         VMSTATE_END_OF_LIST()
501     }
502 };
503 
504 static const VMStateDescription vmstate_configuration = {
505     .name = "configuration",
506     .version_id = 1,
507     .pre_load = configuration_pre_load,
508     .post_load = configuration_post_load,
509     .pre_save = configuration_pre_save,
510     .post_save = configuration_post_save,
511     .fields = (VMStateField[]) {
512         VMSTATE_UINT32(len, SaveState),
513         VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
514         VMSTATE_END_OF_LIST()
515     },
516     .subsections = (const VMStateDescription *[]) {
517         &vmstate_target_page_bits,
518         &vmstate_capabilites,
519         &vmstate_uuid,
520         NULL
521     }
522 };
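
/*
 * A sketch of how the "configuration" section then appears on the wire when
 * emitted from qemu_savevm_state_header() below (machine name illustrative;
 * subsections are appended only when their .needed callback fires):
 *
 *     byte   QEMU_VM_CONFIGURATION
 *     be32   10                      (length of machine type name)
 *     bytes  "pc-q35-7.2"            (machine type name)
 *     ...    target-page-bits / capabilities / uuid subsections, if needed
 */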
523 
524 static void dump_vmstate_vmsd(FILE *out_file,
525                               const VMStateDescription *vmsd, int indent,
526                               bool is_subsection);
527 
528 static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
529                               int indent)
530 {
531     fprintf(out_file, "%*s{\n", indent, "");
532     indent += 2;
533     fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
534     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
535             field->version_id);
536     fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
537             field->field_exists ? "true" : "false");
538     fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
539     if (field->vmsd != NULL) {
540         fprintf(out_file, ",\n");
541         dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
542     }
543     fprintf(out_file, "\n%*s}", indent - 2, "");
544 }
545 
546 static void dump_vmstate_vmss(FILE *out_file,
547                               const VMStateDescription **subsection,
548                               int indent)
549 {
550     if (*subsection != NULL) {
551         dump_vmstate_vmsd(out_file, *subsection, indent, true);
552     }
553 }
554 
555 static void dump_vmstate_vmsd(FILE *out_file,
556                               const VMStateDescription *vmsd, int indent,
557                               bool is_subsection)
558 {
559     if (is_subsection) {
560         fprintf(out_file, "%*s{\n", indent, "");
561     } else {
562         fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
563     }
564     indent += 2;
565     fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
566     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
567             vmsd->version_id);
568     fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
569             vmsd->minimum_version_id);
570     if (vmsd->fields != NULL) {
571         const VMStateField *field = vmsd->fields;
572         bool first;
573 
574         fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
575         first = true;
576         while (field->name != NULL) {
577             if (field->flags & VMS_MUST_EXIST) {
578                 /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
579                 field++;
580                 continue;
581             }
582             if (!first) {
583                 fprintf(out_file, ",\n");
584             }
585             dump_vmstate_vmsf(out_file, field, indent + 2);
586             field++;
587             first = false;
588         }
589         fprintf(out_file, "\n%*s]", indent, "");
590     }
591     if (vmsd->subsections != NULL) {
592         const VMStateDescription **subsection = vmsd->subsections;
593         bool first;
594 
595         fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
596         first = true;
597         while (*subsection != NULL) {
598             if (!first) {
599                 fprintf(out_file, ",\n");
600             }
601             dump_vmstate_vmss(out_file, subsection, indent + 2);
602             subsection++;
603             first = false;
604         }
605         fprintf(out_file, "\n%*s]", indent, "");
606     }
607     fprintf(out_file, "\n%*s}", indent - 2, "");
608 }
609 
610 static void dump_machine_type(FILE *out_file)
611 {
612     MachineClass *mc;
613 
614     mc = MACHINE_GET_CLASS(current_machine);
615 
616     fprintf(out_file, "  \"vmschkmachine\": {\n");
617     fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
618     fprintf(out_file, "  },\n");
619 }
620 
621 void dump_vmstate_json_to_file(FILE *out_file)
622 {
623     GSList *list, *elt;
624     bool first;
625 
626     fprintf(out_file, "{\n");
627     dump_machine_type(out_file);
628 
629     first = true;
630     list = object_class_get_list(TYPE_DEVICE, true);
631     for (elt = list; elt; elt = elt->next) {
632         DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
633                                              TYPE_DEVICE);
634         const char *name;
635         int indent = 2;
636 
637         if (!dc->vmsd) {
638             continue;
639         }
640 
641         if (!first) {
642             fprintf(out_file, ",\n");
643         }
644         name = object_class_get_name(OBJECT_CLASS(dc));
645         fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
646         indent += 2;
647         fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
648         fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
649                 dc->vmsd->version_id);
650         fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
651                 dc->vmsd->minimum_version_id);
652 
653         dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
654 
655         fprintf(out_file, "\n%*s}", indent - 2, "");
656         first = false;
657     }
658     fprintf(out_file, "\n}\n");
659     fclose(out_file);
660     g_slist_free(list);
661 }
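
/*
 * The emitted JSON has roughly this shape (device and field names below are
 * purely illustrative):
 *
 *     {
 *       "vmschkmachine": { "Name": "pc-q35-7.2" },
 *       "fw_cfg": {
 *         "Name": "fw_cfg", "version_id": 2, "minimum_version_id": 1,
 *         "Description": {
 *           "name": "fw_cfg", "version_id": 2, "minimum_version_id": 1,
 *           "Fields": [ { "field": "cur_entry", "version_id": 0, ... } ]
 *         }
 *       }
 *     }
 */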
662 
663 static uint32_t calculate_new_instance_id(const char *idstr)
664 {
665     SaveStateEntry *se;
666     uint32_t instance_id = 0;
667 
668     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
669         if (strcmp(idstr, se->idstr) == 0
670             && instance_id <= se->instance_id) {
671             instance_id = se->instance_id + 1;
672         }
673     }
674     /* Make sure we never loop over without being noticed */
675     assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
676     return instance_id;
677 }
678 
679 static int calculate_compat_instance_id(const char *idstr)
680 {
681     SaveStateEntry *se;
682     int instance_id = 0;
683 
684     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
685         if (!se->compat) {
686             continue;
687         }
688 
689         if (strcmp(idstr, se->compat->idstr) == 0
690             && instance_id <= se->compat->instance_id) {
691             instance_id = se->compat->instance_id + 1;
692         }
693     }
694     return instance_id;
695 }
696 
697 static inline MigrationPriority save_state_priority(SaveStateEntry *se)
698 {
699     if (se->vmsd) {
700         return se->vmsd->priority;
701     }
702     return MIG_PRI_DEFAULT;
703 }
704 
705 static void savevm_state_handler_insert(SaveStateEntry *nse)
706 {
707     MigrationPriority priority = save_state_priority(nse);
708     SaveStateEntry *se;
709     int i;
710 
711     assert(priority <= MIG_PRI_MAX);
712 
713     for (i = priority - 1; i >= 0; i--) {
714         se = savevm_state.handler_pri_head[i];
715         if (se != NULL) {
716             assert(save_state_priority(se) < priority);
717             break;
718         }
719     }
720 
721     if (i >= 0) {
722         QTAILQ_INSERT_BEFORE(se, nse, entry);
723     } else {
724         QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
725     }
726 
727     if (savevm_state.handler_pri_head[priority] == NULL) {
728         savevm_state.handler_pri_head[priority] = nse;
729     }
730 }
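
/*
 * A worked example of the ordering maintained above: with entries
 * { A: MIG_PRI_IOMMU, B: MIG_PRI_DEFAULT } already queued as [A, B], a new
 * MIG_PRI_DEFAULT entry C goes to the tail ([A, B, C]), while a new
 * MIG_PRI_IOMMU entry D is inserted before B, the head of the next lower
 * occupied priority ([A, D, B, C]).  The list thus stays sorted by
 * descending priority, FIFO within each priority level.
 */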
731 
732 static void savevm_state_handler_remove(SaveStateEntry *se)
733 {
734     SaveStateEntry *next;
735     MigrationPriority priority = save_state_priority(se);
736 
737     if (se == savevm_state.handler_pri_head[priority]) {
738         next = QTAILQ_NEXT(se, entry);
739         if (next != NULL && save_state_priority(next) == priority) {
740             savevm_state.handler_pri_head[priority] = next;
741         } else {
742             savevm_state.handler_pri_head[priority] = NULL;
743         }
744     }
745     QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
746 }
747 
748 /* TODO: Individual devices generally have very little idea about the rest
749    of the system, so instance_id should be removed/replaced.
750    Meanwhile pass VMSTATE_INSTANCE_ID_ANY as instance_id if you do not
751    already have a clearly distinguishing id for all instances of your device class. */
752 int register_savevm_live(const char *idstr,
753                          uint32_t instance_id,
754                          int version_id,
755                          const SaveVMHandlers *ops,
756                          void *opaque)
757 {
758     SaveStateEntry *se;
759 
760     se = g_new0(SaveStateEntry, 1);
761     se->version_id = version_id;
762     se->section_id = savevm_state.global_section_id++;
763     se->ops = ops;
764     se->opaque = opaque;
765     se->vmsd = NULL;
766     /* if this is a live migration handler then set is_ram */
767     if (ops->save_setup != NULL) {
768         se->is_ram = 1;
769     }
770 
771     pstrcat(se->idstr, sizeof(se->idstr), idstr);
772 
773     if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
774         se->instance_id = calculate_new_instance_id(se->idstr);
775     } else {
776         se->instance_id = instance_id;
777     }
778     assert(!se->compat || se->instance_id == 0);
779     savevm_state_handler_insert(se);
780     return 0;
781 }
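
/*
 * A minimal sketch of a caller (the handler functions and state here are
 * hypothetical; real users such as the RAM code in migration/ram.c follow
 * the same pattern):
 *
 *     static SaveVMHandlers savevm_mydata_handlers = {
 *         .save_setup = mydata_save_setup,
 *         .save_live_iterate = mydata_save_iterate,
 *         .save_live_complete_precopy = mydata_save_complete,
 *         .load_state = mydata_load_state,
 *     };
 *
 *     register_savevm_live("mydata", VMSTATE_INSTANCE_ID_ANY, 1,
 *                          &savevm_mydata_handlers, &mydata_state);
 */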
782 
783 void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
784 {
785     SaveStateEntry *se, *new_se;
786     char id[256] = "";
787 
788     if (obj) {
789         char *oid = vmstate_if_get_id(obj);
790         if (oid) {
791             pstrcpy(id, sizeof(id), oid);
792             pstrcat(id, sizeof(id), "/");
793             g_free(oid);
794         }
795     }
796     pstrcat(id, sizeof(id), idstr);
797 
798     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
799         if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
800             savevm_state_handler_remove(se);
801             g_free(se->compat);
802             g_free(se);
803         }
804     }
805 }
806 
807 int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
808                                    const VMStateDescription *vmsd,
809                                    void *opaque, int alias_id,
810                                    int required_for_version,
811                                    Error **errp)
812 {
813     SaveStateEntry *se;
814 
815     /* If this triggers, alias support can be dropped for the vmsd. */
816     assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
817 
818     se = g_new0(SaveStateEntry, 1);
819     se->version_id = vmsd->version_id;
820     se->section_id = savevm_state.global_section_id++;
821     se->opaque = opaque;
822     se->vmsd = vmsd;
823     se->alias_id = alias_id;
824 
825     if (obj) {
826         char *id = vmstate_if_get_id(obj);
827         if (id) {
828             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
829                 sizeof(se->idstr)) {
830                 error_setg(errp, "Path too long for VMState (%s)", id);
831                 g_free(id);
832                 g_free(se);
833 
834                 return -1;
835             }
836             g_free(id);
837 
838             se->compat = g_new0(CompatEntry, 1);
839             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
840             se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
841                          calculate_compat_instance_id(vmsd->name) : instance_id;
842             instance_id = VMSTATE_INSTANCE_ID_ANY;
843         }
844     }
845     pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
846 
847     if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
848         se->instance_id = calculate_new_instance_id(se->idstr);
849     } else {
850         se->instance_id = instance_id;
851     }
852     assert(!se->compat || se->instance_id == 0);
853     savevm_state_handler_insert(se);
854     return 0;
855 }
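
/*
 * Most devices reach this through the vmstate_register() wrapper in
 * include/migration/vmstate.h, roughly (device pointer hypothetical):
 *
 *     vmstate_register(VMSTATE_IF(dev), VMSTATE_INSTANCE_ID_ANY,
 *                      &vmstate_mydev, dev);
 *
 * which is equivalent to calling vmstate_register_with_alias_id() with
 * alias_id == -1 and required_for_version == 0.
 */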
856 
857 void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
858                         void *opaque)
859 {
860     SaveStateEntry *se, *new_se;
861 
862     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
863         if (se->vmsd == vmsd && se->opaque == opaque) {
864             savevm_state_handler_remove(se);
865             g_free(se->compat);
866             g_free(se);
867         }
868     }
869 }
870 
871 static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
872 {
873     trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
874     if (!se->vmsd) {         /* Old style */
875         return se->ops->load_state(f, se->opaque, se->load_version_id);
876     }
877     return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
878 }
879 
880 static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
881                                    JSONWriter *vmdesc)
882 {
883     int64_t old_offset, size;
884 
885     old_offset = qemu_file_total_transferred_fast(f);
886     se->ops->save_state(f, se->opaque);
887     size = qemu_file_total_transferred_fast(f) - old_offset;
888 
889     if (vmdesc) {
890         json_writer_int64(vmdesc, "size", size);
891         json_writer_start_array(vmdesc, "fields");
892         json_writer_start_object(vmdesc, NULL);
893         json_writer_str(vmdesc, "name", "data");
894         json_writer_int64(vmdesc, "size", size);
895         json_writer_str(vmdesc, "type", "buffer");
896         json_writer_end_object(vmdesc);
897         json_writer_end_array(vmdesc);
898     }
899 }
900 
901 static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
902                         JSONWriter *vmdesc)
903 {
904     trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
905     if (!se->vmsd) {
906         vmstate_save_old_style(f, se, vmdesc);
907         return 0;
908     }
909     return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
910 }
911 
912 /*
913  * Write the header for a device section (QEMU_VM_SECTION_START/END/PART/FULL)
914  */
915 static void save_section_header(QEMUFile *f, SaveStateEntry *se,
916                                 uint8_t section_type)
917 {
918     qemu_put_byte(f, section_type);
919     qemu_put_be32(f, se->section_id);
920 
921     if (section_type == QEMU_VM_SECTION_FULL ||
922         section_type == QEMU_VM_SECTION_START) {
923         /* ID string */
924         size_t len = strlen(se->idstr);
925         qemu_put_byte(f, len);
926         qemu_put_buffer(f, (uint8_t *)se->idstr, len);
927 
928         qemu_put_be32(f, se->instance_id);
929         qemu_put_be32(f, se->version_id);
930     }
931 }
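
/*
 * For example, a QEMU_VM_SECTION_FULL header for a hypothetical entry
 * "0000:00:02.0/virtio-net" (instance_id 0, version_id 11) is laid out as:
 *
 *     byte   QEMU_VM_SECTION_FULL
 *     be32   section_id
 *     byte   23                         (idstr length)
 *     bytes  "0000:00:02.0/virtio-net"
 *     be32   0                          (instance_id)
 *     be32   11                         (version_id)
 */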
932 
933 /*
934  * Write a footer onto device sections that catches cases of misformatted
935  * device sections.
936  */
937 static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
938 {
939     if (migrate_get_current()->send_section_footer) {
940         qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
941         qemu_put_be32(f, se->section_id);
942     }
943 }
944 
945 /**
946  * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
947  *                           command and associated data.
948  *
949  * @f: File to send command on
950  * @command: Command type to send
951  * @len: Length of associated data
952  * @data: Data associated with command.
953  */
954 static void qemu_savevm_command_send(QEMUFile *f,
955                                      enum qemu_vm_cmd command,
956                                      uint16_t len,
957                                      uint8_t *data)
958 {
959     trace_savevm_command_send(command, len);
960     qemu_put_byte(f, QEMU_VM_COMMAND);
961     qemu_put_be16(f, (uint16_t)command);
962     qemu_put_be16(f, len);
963     qemu_put_buffer(f, data, len);
964     qemu_fflush(f);
965 }
966 
967 void qemu_savevm_send_colo_enable(QEMUFile *f)
968 {
969     trace_savevm_send_colo_enable();
970     qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
971 }
972 
973 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
974 {
975     uint32_t buf;
976 
977     trace_savevm_send_ping(value);
978     buf = cpu_to_be32(value);
979     qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
980 }
981 
982 void qemu_savevm_send_open_return_path(QEMUFile *f)
983 {
984     trace_savevm_send_open_return_path();
985     qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
986 }
987 
988 /* We have a buffer of data to send; we don't want it all to be loaded
989  * by the command itself, so the command contains just the length of the
990  * extra buffer that we then send straight after it.
991  * TODO: There must be a better way to organise this
992  *
993  * Returns:
994  *    0 on success
995  *    negative on error
996  */
997 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
998 {
999     uint32_t tmp;
1000 
1001     if (len > MAX_VM_CMD_PACKAGED_SIZE) {
1002         error_report("%s: Unreasonably large packaged state: %zu",
1003                      __func__, len);
1004         return -1;
1005     }
1006 
1007     tmp = cpu_to_be32(len);
1008 
1009     trace_qemu_savevm_send_packaged();
1010     qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
1011 
1012     qemu_put_buffer(f, buf, len);
1013 
1014     return 0;
1015 }
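
/*
 * A sketch of the sending side's use: the postcopy code in
 * migration/migration.c first renders the device state into a
 * QIOChannelBuffer, then hands the buffer here, roughly:
 *
 *     QIOChannelBuffer *bioc = qio_channel_buffer_new(4096);
 *     QEMUFile *fb = qemu_file_new_output(QIO_CHANNEL(bioc));
 *     ... write device sections into fb and flush it ...
 *     qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage);
 */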
1016 
1017 /* Send prior to any postcopy transfer */
1018 void qemu_savevm_send_postcopy_advise(QEMUFile *f)
1019 {
1020     if (migrate_postcopy_ram()) {
1021         uint64_t tmp[2];
1022         tmp[0] = cpu_to_be64(ram_pagesize_summary());
1023         tmp[1] = cpu_to_be64(qemu_target_page_size());
1024 
1025         trace_qemu_savevm_send_postcopy_advise();
1026         qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
1027                                  16, (uint8_t *)tmp);
1028     } else {
1029         qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
1030     }
1031 }
1032 
1033 /* Sent prior to starting the destination running in postcopy, discard pages
1034  * that have already been sent but redirtied on the source.
1035  * CMD_POSTCOPY_RAM_DISCARD consists of:
1036  *      byte   version (0)
1037  *      byte   Length of name field (not including 0)
1038  *  n x byte   RAM block name
1039  *      byte   0 terminator (just for safety)
1040  *  n x        Byte ranges within the named RAMBlock
1041  *      be64   Start of the range
1042  *      be64   Length
1043  *
1044  *  name:  RAMBlock name that these entries are part of
1045  *  len: Number of page entries
1046  *  start_list: 'len' addresses
1047  *  length_list: 'len' byte lengths
1048  *
1049  */
1050 void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
1051                                            uint16_t len,
1052                                            uint64_t *start_list,
1053                                            uint64_t *length_list)
1054 {
1055     uint8_t *buf;
1056     uint16_t tmplen;
1057     uint16_t t;
1058     size_t name_len = strlen(name);
1059 
1060     trace_qemu_savevm_send_postcopy_ram_discard(name, len);
1061     assert(name_len < 256);
1062     buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
1063     buf[0] = postcopy_ram_discard_version;
1064     buf[1] = name_len;
1065     memcpy(buf + 2, name, name_len);
1066     tmplen = 2 + name_len;
1067     buf[tmplen++] = '\0';
1068 
1069     for (t = 0; t < len; t++) {
1070         stq_be_p(buf + tmplen, start_list[t]);
1071         tmplen += 8;
1072         stq_be_p(buf + tmplen, length_list[t]);
1073         tmplen += 8;
1074     }
1075     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
1076     g_free(buf);
1077 }
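
/*
 * A worked example (values illustrative): discarding a single byte range in
 * a block named "pc.ram" produces, with name_len == 6:
 *
 *     buf[0]       0                   (postcopy_ram_discard_version)
 *     buf[1]       6                   (name length)
 *     buf[2..7]    "pc.ram"
 *     buf[8]       0                   (terminator)
 *     buf[9..16]   be64 start of the range
 *     buf[17..24]  be64 length of the range
 *
 * for a final tmplen of 25 bytes.
 */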
1078 
1079 /* Get the destination into a state where it can receive postcopy data. */
1080 void qemu_savevm_send_postcopy_listen(QEMUFile *f)
1081 {
1082     trace_savevm_send_postcopy_listen();
1083     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
1084 }
1085 
1086 /* Kick the destination into running */
1087 void qemu_savevm_send_postcopy_run(QEMUFile *f)
1088 {
1089     trace_savevm_send_postcopy_run();
1090     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
1091 }
1092 
1093 void qemu_savevm_send_postcopy_resume(QEMUFile *f)
1094 {
1095     trace_savevm_send_postcopy_resume();
1096     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
1097 }
1098 
1099 void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
1100 {
1101     size_t len;
1102     char buf[256];
1103 
1104     trace_savevm_send_recv_bitmap(block_name);
1105 
1106     buf[0] = len = strlen(block_name);
1107     memcpy(buf + 1, block_name, len);
1108 
1109     qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
1110 }
1111 
1112 bool qemu_savevm_state_blocked(Error **errp)
1113 {
1114     SaveStateEntry *se;
1115 
1116     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1117         if (se->vmsd && se->vmsd->unmigratable) {
1118             error_setg(errp, "State blocked by non-migratable device '%s'",
1119                        se->idstr);
1120             return true;
1121         }
1122     }
1123     return false;
1124 }
1125 
1126 void qemu_savevm_non_migratable_list(strList **reasons)
1127 {
1128     SaveStateEntry *se;
1129 
1130     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1131         if (se->vmsd && se->vmsd->unmigratable) {
1132             QAPI_LIST_PREPEND(*reasons,
1133                               g_strdup_printf("non-migratable device: %s",
1134                                               se->idstr));
1135         }
1136     }
1137 }
1138 
1139 void qemu_savevm_state_header(QEMUFile *f)
1140 {
1141     trace_savevm_state_header();
1142     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1143     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1144 
1145     if (migrate_get_current()->send_configuration) {
1146         qemu_put_byte(f, QEMU_VM_CONFIGURATION);
1147         vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
1148     }
1149 }
1150 
1151 bool qemu_savevm_state_guest_unplug_pending(void)
1152 {
1153     SaveStateEntry *se;
1154 
1155     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1156         if (se->vmsd && se->vmsd->dev_unplug_pending &&
1157             se->vmsd->dev_unplug_pending(se->opaque)) {
1158             return true;
1159         }
1160     }
1161 
1162     return false;
1163 }
1164 
1165 void qemu_savevm_state_setup(QEMUFile *f)
1166 {
1167     SaveStateEntry *se;
1168     Error *local_err = NULL;
1169     int ret;
1170 
1171     trace_savevm_state_setup();
1172     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1173         if (!se->ops || !se->ops->save_setup) {
1174             continue;
1175         }
1176         if (se->ops->is_active) {
1177             if (!se->ops->is_active(se->opaque)) {
1178                 continue;
1179             }
1180         }
1181         save_section_header(f, se, QEMU_VM_SECTION_START);
1182 
1183         ret = se->ops->save_setup(f, se->opaque);
1184         save_section_footer(f, se);
1185         if (ret < 0) {
1186             qemu_file_set_error(f, ret);
1187             break;
1188         }
1189     }
1190 
1191     if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
1192         error_report_err(local_err);
1193     }
1194 }
1195 
1196 int qemu_savevm_state_resume_prepare(MigrationState *s)
1197 {
1198     SaveStateEntry *se;
1199     int ret;
1200 
1201     trace_savevm_state_resume_prepare();
1202 
1203     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1204         if (!se->ops || !se->ops->resume_prepare) {
1205             continue;
1206         }
1207         if (se->ops->is_active) {
1208             if (!se->ops->is_active(se->opaque)) {
1209                 continue;
1210             }
1211         }
1212         ret = se->ops->resume_prepare(s, se->opaque);
1213         if (ret < 0) {
1214             return ret;
1215         }
1216     }
1217 
1218     return 0;
1219 }
1220 
1221 /*
1222  * This function has three return values:
1223  *   negative: there was an error, and we have -errno.
1224  *   0 : We haven't finished, the caller has to go again
1225  *   1 : We have finished, we can go to the complete phase
1226  */
1227 int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1228 {
1229     SaveStateEntry *se;
1230     int ret = 1;
1231 
1232     trace_savevm_state_iterate();
1233     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1234         if (!se->ops || !se->ops->save_live_iterate) {
1235             continue;
1236         }
1237         if (se->ops->is_active &&
1238             !se->ops->is_active(se->opaque)) {
1239             continue;
1240         }
1241         if (se->ops->is_active_iterate &&
1242             !se->ops->is_active_iterate(se->opaque)) {
1243             continue;
1244         }
1245         /*
1246          * In the postcopy phase, any device that doesn't know how to
1247          * do postcopy should have saved its state in the _complete
1248          * call that's already run; it might get confused if we call
1249          * iterate afterwards.
1250          */
1251         if (postcopy &&
1252             !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1253             continue;
1254         }
1255         if (qemu_file_rate_limit(f)) {
1256             return 0;
1257         }
1258         trace_savevm_section_start(se->idstr, se->section_id);
1259 
1260         save_section_header(f, se, QEMU_VM_SECTION_PART);
1261 
1262         ret = se->ops->save_live_iterate(f, se->opaque);
1263         trace_savevm_section_end(se->idstr, se->section_id, ret);
1264         save_section_footer(f, se);
1265 
1266         if (ret < 0) {
1267             error_report("failed to save SaveStateEntry with id(name): "
1268                          "%d(%s): %d",
1269                          se->section_id, se->idstr, ret);
1270             qemu_file_set_error(f, ret);
1271         }
1272         if (ret <= 0) {
1273             /* Do not proceed to the next vmstate before this one has
1274                reported completion of the current stage. This serializes the
1275                migration and reduces the probability that a faster-changing
1276                state is synchronized over and over again. */
1277             break;
1278         }
1279     }
1280     return ret;
1281 }
1282 
1283 static bool should_send_vmdesc(void)
1284 {
1285     MachineState *machine = MACHINE(qdev_get_machine());
1286     bool in_postcopy = migration_in_postcopy();
1287     return !machine->suppress_vmdesc && !in_postcopy;
1288 }
1289 
1290 /*
1291  * Calls the save_live_complete_postcopy methods
1292  * causing the last few pages to be sent immediately and doing any associated
1293  * cleanup.
1294  * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1295  * all the other devices, but that happens at the point we switch to postcopy.
1296  */
1297 void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1298 {
1299     SaveStateEntry *se;
1300     int ret;
1301 
1302     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1303         if (!se->ops || !se->ops->save_live_complete_postcopy) {
1304             continue;
1305         }
1306         if (se->ops->is_active) {
1307             if (!se->ops->is_active(se->opaque)) {
1308                 continue;
1309             }
1310         }
1311         trace_savevm_section_start(se->idstr, se->section_id);
1312         /* Section type */
1313         qemu_put_byte(f, QEMU_VM_SECTION_END);
1314         qemu_put_be32(f, se->section_id);
1315 
1316         ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1317         trace_savevm_section_end(se->idstr, se->section_id, ret);
1318         save_section_footer(f, se);
1319         if (ret < 0) {
1320             qemu_file_set_error(f, ret);
1321             return;
1322         }
1323     }
1324 
1325     qemu_put_byte(f, QEMU_VM_EOF);
1326     qemu_fflush(f);
1327 }
1328 
1329 static
1330 int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
1331 {
1332     SaveStateEntry *se;
1333     int ret;
1334 
1335     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1336         if (!se->ops ||
1337             (in_postcopy && se->ops->has_postcopy &&
1338              se->ops->has_postcopy(se->opaque)) ||
1339             !se->ops->save_live_complete_precopy) {
1340             continue;
1341         }
1342 
1343         if (se->ops->is_active) {
1344             if (!se->ops->is_active(se->opaque)) {
1345                 continue;
1346             }
1347         }
1348         trace_savevm_section_start(se->idstr, se->section_id);
1349 
1350         save_section_header(f, se, QEMU_VM_SECTION_END);
1351 
1352         ret = se->ops->save_live_complete_precopy(f, se->opaque);
1353         trace_savevm_section_end(se->idstr, se->section_id, ret);
1354         save_section_footer(f, se);
1355         if (ret < 0) {
1356             qemu_file_set_error(f, ret);
1357             return -1;
1358         }
1359     }
1360 
1361     return 0;
1362 }
1363 
1364 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
1365                                                     bool in_postcopy,
1366                                                     bool inactivate_disks)
1367 {
1368     g_autoptr(JSONWriter) vmdesc = NULL;
1369     int vmdesc_len;
1370     SaveStateEntry *se;
1371     int ret;
1372 
1373     vmdesc = json_writer_new(false);
1374     json_writer_start_object(vmdesc, NULL);
1375     json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
1376     json_writer_start_array(vmdesc, "devices");
1377     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1378 
1379         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1380             continue;
1381         }
1382         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1383             trace_savevm_section_skip(se->idstr, se->section_id);
1384             continue;
1385         }
1386 
1387         trace_savevm_section_start(se->idstr, se->section_id);
1388 
1389         json_writer_start_object(vmdesc, NULL);
1390         json_writer_str(vmdesc, "name", se->idstr);
1391         json_writer_int64(vmdesc, "instance_id", se->instance_id);
1392 
1393         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1394         ret = vmstate_save(f, se, vmdesc);
1395         if (ret) {
1396             qemu_file_set_error(f, ret);
1397             return ret;
1398         }
1399         trace_savevm_section_end(se->idstr, se->section_id, 0);
1400         save_section_footer(f, se);
1401 
1402         json_writer_end_object(vmdesc);
1403     }
1404 
1405     if (inactivate_disks) {
1406         /* Inactivate before sending QEMU_VM_EOF so that the
1407          * bdrv_activate_all() on the other end won't fail. */
1408         ret = bdrv_inactivate_all();
1409         if (ret) {
1410             error_report("%s: bdrv_inactivate_all() failed (%d)",
1411                          __func__, ret);
1412             qemu_file_set_error(f, ret);
1413             return ret;
1414         }
1415     }
1416     if (!in_postcopy) {
1417         /* Postcopy stream will still be going */
1418         qemu_put_byte(f, QEMU_VM_EOF);
1419     }
1420 
1421     json_writer_end_array(vmdesc);
1422     json_writer_end_object(vmdesc);
1423     vmdesc_len = strlen(json_writer_get(vmdesc));
1424 
1425     if (should_send_vmdesc()) {
1426         qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1427         qemu_put_be32(f, vmdesc_len);
1428         qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
1429     }
1430 
1431     return 0;
1432 }
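
/*
 * The vmdesc JSON assembled above looks roughly like this (abridged, names
 * illustrative):
 *
 *     { "page_size": 4096,
 *       "devices": [
 *         { "name": "timer", "instance_id": 0,
 *           "fields": [ { "name": "cpu_ticks_offset",
 *                         "type": "int64", "size": 8 }, ... ] },
 *         ... ] }
 *
 * Tools such as scripts/analyze-migration.py use it to decode the stream
 * without access to the device code.
 */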
1433 
1434 int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1435                                        bool inactivate_disks)
1436 {
1437     int ret;
1438     Error *local_err = NULL;
1439     bool in_postcopy = migration_in_postcopy();
1440 
1441     if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
1442         error_report_err(local_err);
1443     }
1444 
1445     trace_savevm_state_complete_precopy();
1446 
1447     cpu_synchronize_all_states();
1448 
1449     if (!in_postcopy || iterable_only) {
1450         ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
1451         if (ret) {
1452             return ret;
1453         }
1454     }
1455 
1456     if (iterable_only) {
1457         goto flush;
1458     }
1459 
1460     ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
1461                                                           inactivate_disks);
1462     if (ret) {
1463         return ret;
1464     }
1465 
1466 flush:
1467     qemu_fflush(f);
1468     return 0;
1469 }
1470 
1471 /* Give an estimate of the amount left to be transferred;
1472  * the result is split into the amount for units that can and
1473  * for units that can't do postcopy.
1474  */
1475 void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1476                                uint64_t *res_precopy_only,
1477                                uint64_t *res_compatible,
1478                                uint64_t *res_postcopy_only)
1479 {
1480     SaveStateEntry *se;
1481 
1482     *res_precopy_only = 0;
1483     *res_compatible = 0;
1484     *res_postcopy_only = 0;
1485 
1486 
1487     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1488         if (!se->ops || !se->ops->save_live_pending) {
1489             continue;
1490         }
1491         if (se->ops->is_active) {
1492             if (!se->ops->is_active(se->opaque)) {
1493                 continue;
1494             }
1495         }
1496         se->ops->save_live_pending(f, se->opaque, threshold_size,
1497                                    res_precopy_only, res_compatible,
1498                                    res_postcopy_only);
1499     }
1500 }
1501 
1502 void qemu_savevm_state_cleanup(void)
1503 {
1504     SaveStateEntry *se;
1505     Error *local_err = NULL;
1506 
1507     if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
1508         error_report_err(local_err);
1509     }
1510 
1511     trace_savevm_state_cleanup();
1512     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1513         if (se->ops && se->ops->save_cleanup) {
1514             se->ops->save_cleanup(se->opaque);
1515         }
1516     }
1517 }
1518 
1519 static int qemu_savevm_state(QEMUFile *f, Error **errp)
1520 {
1521     int ret;
1522     MigrationState *ms = migrate_get_current();
1523     MigrationStatus status;
1524 
1525     if (migration_is_running(ms->state)) {
1526         error_setg(errp, QERR_MIGRATION_ACTIVE);
1527         return -EINVAL;
1528     }
1529 
1530     if (migrate_use_block()) {
1531         error_setg(errp, "Block migration and snapshots are incompatible");
1532         return -EINVAL;
1533     }
1534 
1535     migrate_init(ms);
1536     memset(&ram_counters, 0, sizeof(ram_counters));
1537     memset(&compression_counters, 0, sizeof(compression_counters));
1538     ms->to_dst_file = f;
1539 
1540     qemu_mutex_unlock_iothread();
1541     qemu_savevm_state_header(f);
1542     qemu_savevm_state_setup(f);
1543     qemu_mutex_lock_iothread();
1544 
1545     while (qemu_file_get_error(f) == 0) {
1546         if (qemu_savevm_state_iterate(f, false) > 0) {
1547             break;
1548         }
1549     }
1550 
1551     ret = qemu_file_get_error(f);
1552     if (ret == 0) {
1553         qemu_savevm_state_complete_precopy(f, false, false);
1554         ret = qemu_file_get_error(f);
1555     }
1556     qemu_savevm_state_cleanup();
1557     if (ret != 0) {
1558         error_setg_errno(errp, -ret, "Error while writing VM state");
1559     }
1560 
1561     if (ret != 0) {
1562         status = MIGRATION_STATUS_FAILED;
1563     } else {
1564         status = MIGRATION_STATUS_COMPLETED;
1565     }
1566     migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1567 
1568     /* f is an outer parameter; it should not stay in the global migration
1569      * state after this function finishes */
1570     ms->to_dst_file = NULL;
1571 
1572     return ret;
1573 }
1574 
1575 void qemu_savevm_live_state(QEMUFile *f)
1576 {
1577     /* save QEMU_VM_SECTION_END section */
1578     qemu_savevm_state_complete_precopy(f, true, false);
1579     qemu_put_byte(f, QEMU_VM_EOF);
1580 }
1581 
1582 int qemu_save_device_state(QEMUFile *f)
1583 {
1584     SaveStateEntry *se;
1585 
1586     if (!migration_in_colo_state()) {
1587         qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1588         qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1589     }
1590     cpu_synchronize_all_states();
1591 
1592     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1593         int ret;
1594 
1595         if (se->is_ram) {
1596             continue;
1597         }
1598         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1599             continue;
1600         }
1601         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1602             continue;
1603         }
1604 
1605         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1606 
1607         ret = vmstate_save(f, se, NULL);
1608         if (ret) {
1609             return ret;
1610         }
1611 
1612         save_section_footer(f, se);
1613     }
1614 
1615     qemu_put_byte(f, QEMU_VM_EOF);
1616 
1617     return qemu_file_get_error(f);
1618 }
1619 
1620 static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
1621 {
1622     SaveStateEntry *se;
1623 
1624     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1625         if (!strcmp(se->idstr, idstr) &&
1626             (instance_id == se->instance_id ||
1627              instance_id == se->alias_id))
1628             return se;
1629         /* Migrating from an older version? */
1630         if (strstr(se->idstr, idstr) && se->compat) {
1631             if (!strcmp(se->compat->idstr, idstr) &&
1632                 (instance_id == se->compat->instance_id ||
1633                  instance_id == se->alias_id))
1634                 return se;
1635         }
1636     }
1637     return NULL;
1638 }
1639 
1640 enum LoadVMExitCodes {
1641     /* Allow a command to quit all layers of nested loadvm loops */
1642     LOADVM_QUIT     =  1,
1643 };
1644 
1645 /* ------ incoming postcopy messages ------ */
1646 /* 'advise' arrives before any transfers just to tell us that a postcopy
1647  * *might* happen; it might be skipped if precopy transferred everything
1648  * quickly.
1649  */
1650 static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1651                                          uint16_t len)
1652 {
1653     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1654     uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1655     size_t page_size = qemu_target_page_size();
1656     Error *local_err = NULL;
1657 
1658     trace_loadvm_postcopy_handle_advise();
1659     if (ps != POSTCOPY_INCOMING_NONE) {
1660         error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1661         return -1;
1662     }
1663 
1664     switch (len) {
1665     case 0:
1666         if (migrate_postcopy_ram()) {
1667             error_report("RAM postcopy is enabled but received a zero-length advise");
1668             return -EINVAL;
1669         }
1670         return 0;
1671     case 8 + 8:
1672         if (!migrate_postcopy_ram()) {
1673             error_report("RAM postcopy is disabled but received a 16 byte advise");
1674             return -EINVAL;
1675         }
1676         break;
1677     default:
1678         error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1679         return -EINVAL;
1680     }
1681 
1682     if (!postcopy_ram_supported_by_host(mis)) {
1683         postcopy_state_set(POSTCOPY_INCOMING_NONE);
1684         return -1;
1685     }
1686 
1687     remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1688     local_pagesize_summary = ram_pagesize_summary();
1689 
1690     if (remote_pagesize_summary != local_pagesize_summary)  {
1691         /*
1692          * This detects two potential causes of mismatch:
1693          *   a) A mismatch in host page sizes
1694          *      Some combinations of mismatch are probably possible but it gets
1695          *      a bit more complicated.  In particular we need to place whole
1696          *      host pages on the dest at once, and we need to ensure that we
1697          *      handle dirtying to make sure we never end up sending part of
1698          *      a hostpage on its own.
1699          *   b) The use of different huge page sizes on source/destination.
1700          *      A finer-grained test is performed during RAM block migration,
1701          *      but this test gives a nice early, clear failure, and it also
1702          *      fails when passed to an older qemu that doesn't support
1703          *      huge pages.
1704          */
1705         error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1706                                                              " d=%" PRIx64 ")",
1707                      remote_pagesize_summary, local_pagesize_summary);
1708         return -1;
1709     }
1710 
1711     remote_tps = qemu_get_be64(mis->from_src_file);
1712     if (remote_tps != page_size) {
1713         /*
1714          * Again, some differences could be dealt with, but for now keep it
1715          * simple.
1716          */
1717         error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1718                      (int)remote_tps, page_size);
1719         return -1;
1720     }
1721 
1722     if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1723         error_report_err(local_err);
1724         return -1;
1725     }
1726 
1727     if (ram_postcopy_incoming_init(mis)) {
1728         return -1;
1729     }
1730 
1731     return 0;
1732 }
1733 
1734 /* After postcopy we will be told to throw some pages away since they're
1735  * dirty and will have to be demand fetched.  Must happen before the CPU
1736  * is started.
1737  * There can be 0..many of these messages, each encoding multiple pages.
1738  */
1739 static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1740                                               uint16_t len)
1741 {
1742     int tmp;
1743     char ramid[256];
1744     PostcopyState ps = postcopy_state_get();
1745 
1746     trace_loadvm_postcopy_ram_handle_discard();
1747 
1748     switch (ps) {
1749     case POSTCOPY_INCOMING_ADVISE:
1750         /* 1st discard */
1751         tmp = postcopy_ram_prepare_discard(mis);
1752         if (tmp) {
1753             return tmp;
1754         }
1755         break;
1756 
1757     case POSTCOPY_INCOMING_DISCARD:
1758         /* Expected state */
1759         break;
1760 
1761     default:
1762         error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1763                      ps);
1764         return -1;
1765     }
1766     /* We're expecting:
1767      *    a version byte (0),
1768      *    a RAM ID string (length byte, name, 0 term),
1769      *    then at least one 16 byte chunk.
1770      */
1771     if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1772         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1773         return -1;
1774     }
1775 
1776     tmp = qemu_get_byte(mis->from_src_file);
1777     if (tmp != postcopy_ram_discard_version) {
1778         error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1779         return -1;
1780     }
1781 
1782     if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1783         error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1784         return -1;
1785     }
1786     tmp = qemu_get_byte(mis->from_src_file);
1787     if (tmp != 0) {
1788         error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1789         return -1;
1790     }
1791 
1792     len -= 3 + strlen(ramid);
1793     if (len % 16) {
1794         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1795         return -1;
1796     }
1797     trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1798     while (len) {
1799         uint64_t start_addr, block_length;
1800         start_addr = qemu_get_be64(mis->from_src_file);
1801         block_length = qemu_get_be64(mis->from_src_file);
1802 
1803         len -= 16;
1804         int ret = ram_discard_range(ramid, start_addr, block_length);
1805         if (ret) {
1806             return ret;
1807         }
1808     }
1809     trace_loadvm_postcopy_ram_handle_discard_end();
1810 
1811     return 0;
1812 }
1813 
1814 /*
1815  * Triggered by a postcopy_listen command; this thread takes over reading
1816  * the input stream, leaving the main thread free to carry on loading the rest
1817  * of the device state (from RAM).
1818  * (TODO: This could do with being in a postcopy file - but then again
1819  * it's just another input loop, not that postcopy-specific)
1820  */
1821 static void *postcopy_ram_listen_thread(void *opaque)
1822 {
1823     MigrationIncomingState *mis = migration_incoming_get_current();
1824     QEMUFile *f = mis->from_src_file;
1825     int load_res;
1826     MigrationState *migr = migrate_get_current();
1827 
1828     object_ref(OBJECT(migr));
1829 
1830     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1831                                    MIGRATION_STATUS_POSTCOPY_ACTIVE);
1832     qemu_sem_post(&mis->thread_sync_sem);
1833     trace_postcopy_ram_listen_thread_start();
1834 
1835     rcu_register_thread();
1836     /*
1837      * Because we're a thread and not a coroutine we can't yield
1838      * in qemu_file, and thus we must be blocking now.
1839      */
1840     qemu_file_set_blocking(f, true);
1841     load_res = qemu_loadvm_state_main(f, mis);
1842 
1843     /*
1844      * This is tricky: mis->from_src_file can change after
1845      * qemu_loadvm_state_main() returns, when postcopy recovery happens.
1846      * In the future, we may want a wrapper for the QEMUFile handle.
1847      */
1848     f = mis->from_src_file;
1849 
1850     /* And non-blocking again so we don't block in any cleanup */
1851     qemu_file_set_blocking(f, false);
1852 
1853     trace_postcopy_ram_listen_thread_exit();
1854     if (load_res < 0) {
1855         qemu_file_set_error(f, load_res);
1856         dirty_bitmap_mig_cancel_incoming();
1857         if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
1858             !migrate_postcopy_ram() && migrate_dirty_bitmaps())
1859         {
1860             error_report("%s: loadvm failed during postcopy: %d. All state "
1861                          "was migrated except dirty bitmaps. Some dirty "
1862                          "bitmaps may be lost, but the dirty bitmaps that "
1863                          "were migrated are valid.",
1864                          __func__, load_res);
1865             load_res = 0; /* prevent further exit() */
1866         } else {
1867             error_report("%s: loadvm failed: %d", __func__, load_res);
1868             migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1869                                            MIGRATION_STATUS_FAILED);
1870         }
1871     }
1872     if (load_res >= 0) {
1873         /*
1874          * This looks good, but it's possible that the device loading in the
1875          * main thread hasn't finished yet, and so we might not be in 'RUN'
1876          * state yet; wait for the end of the main thread.
1877          */
1878         qemu_event_wait(&mis->main_thread_load_event);
1879     }
1880     postcopy_ram_incoming_cleanup(mis);
1881 
1882     if (load_res < 0) {
1883         /*
1884          * If something went wrong then we have a bad state so exit;
1885          * depending how far we got it might be possible at this point
1886          * to leave the guest running and fire MCEs for pages that never
1887          * arrived as a desperate recovery step.
1888          */
1889         rcu_unregister_thread();
1890         exit(EXIT_FAILURE);
1891     }
1892 
1893     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1894                                    MIGRATION_STATUS_COMPLETED);
1895     /*
1896      * If everything has worked fine, then the main thread has waited
1897      * for us to start, and we're the last use of the mis.
1898      * (If something broke then qemu will have to exit anyway since it's
1899      * got a bad migration state).
1900      */
1901     migration_incoming_state_destroy();
1902     qemu_loadvm_state_cleanup();
1903 
1904     rcu_unregister_thread();
1905     mis->have_listen_thread = false;
1906     postcopy_state_set(POSTCOPY_INCOMING_END);
1907 
1908     object_unref(OBJECT(migr));
1909 
1910     return NULL;
1911 }
1912 
1913 /* After this message we must be able to immediately receive postcopy data */
1914 static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1915 {
1916     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1917     Error *local_err = NULL;
1918 
1919     trace_loadvm_postcopy_handle_listen("enter");
1920 
1921     if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1922         error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1923         return -1;
1924     }
1925     if (ps == POSTCOPY_INCOMING_ADVISE) {
1926         /*
1927          * A rare case, we entered listen without having to do any discards,
1928          * so do the setup that's normally done at the time of the 1st discard.
1929          */
1930         if (migrate_postcopy_ram()) {
1931             if (postcopy_ram_prepare_discard(mis)) {
1932                 return -1;
1933             }
1932         }
1933     }
1934 
1935     trace_loadvm_postcopy_handle_listen("after discard");
1936 
1937     /*
1938      * Sensitise RAM - we can now get requests for blocks that don't exist.
1939      * However, at this point the CPU shouldn't be running and the IO
1940      * shouldn't be doing anything yet, so don't actually expect requests.
1941      */
1942     if (migrate_postcopy_ram()) {
1943         if (postcopy_ram_incoming_setup(mis)) {
1944             postcopy_ram_incoming_cleanup(mis);
1945             return -1;
1946         }
1947     }
1948 
1949     trace_loadvm_postcopy_handle_listen("after uffd");
1950 
1951     if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
1952         error_report_err(local_err);
1953         return -1;
1954     }
1955 
1956     mis->have_listen_thread = true;
1957     postcopy_thread_create(mis, &mis->listen_thread, "postcopy/listen",
1958                            postcopy_ram_listen_thread, QEMU_THREAD_DETACHED);
1959     trace_loadvm_postcopy_handle_listen("return");
1960 
1961     return 0;
1962 }
1963 
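     /*
      * Bottom half scheduled by loadvm_postcopy_handle_run(): synchronise
      * CPU state, announce ourselves on the network, activate the block
      * devices, and then either start the CPU or leave the VM paused.
      */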
1964 static void loadvm_postcopy_handle_run_bh(void *opaque)
1965 {
1966     Error *local_err = NULL;
1967     MigrationIncomingState *mis = opaque;
1968 
1969     trace_loadvm_postcopy_handle_run_bh("enter");
1970 
1971     /* TODO: we should move all of this lot into postcopy_ram.c or into
1972      * shared code in migration.c
1973      */
1974     cpu_synchronize_all_post_init();
1975 
1976     trace_loadvm_postcopy_handle_run_bh("after cpu sync");
1977 
1978     qemu_announce_self(&mis->announce_timer, migrate_announce_params());
1979 
1980     trace_loadvm_postcopy_handle_run_bh("after announce");
1981 
1982     /* Make sure all file formats throw away their mutable metadata.
1983      * If we get an error here, just don't restart the VM yet. */
1984     bdrv_activate_all(&local_err);
1985     if (local_err) {
1986         error_report_err(local_err);
1987         local_err = NULL;
1988         autostart = false;
1989     }
1990 
1991     trace_loadvm_postcopy_handle_run_bh("after invalidate cache");
1992 
1993     dirty_bitmap_mig_before_vm_start();
1994 
1995     if (autostart) {
1996         /* Hold onto your hats, starting the CPU */
1997         vm_start();
1998     } else {
1999         /* leave it paused and let management decide when to start the CPU */
2000         runstate_set(RUN_STATE_PAUSED);
2001     }
2002 
2003     qemu_bh_delete(mis->bh);
2004 
2005     trace_loadvm_postcopy_handle_run_bh("return");
2006 }
2007 
2008 /* After all discards we can start running and asking for pages */
2009 static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
2010 {
2011     PostcopyState ps = postcopy_state_get();
2012 
2013     trace_loadvm_postcopy_handle_run();
2014     if (ps != POSTCOPY_INCOMING_LISTENING) {
2015         error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
2016         return -1;
2017     }
2018 
2019     postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2020     mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
2021     qemu_bh_schedule(mis->bh);
2022 
2023     /* We need to finish reading the stream from the package
2024      * and also stop reading anything more from the stream that loaded the
2025      * package (since it's now being read by the listener thread).
2026      * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2027      */
2028     return LOADVM_QUIT;
2029 }
2030 
2031 /* Must be called with page_request_mutex held */
2032 static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
2033                                        gpointer data)
2034 {
2035     MigrationIncomingState *mis = data;
2036     void *host_addr = (void *) key;
2037     ram_addr_t rb_offset;
2038     RAMBlock *rb;
2039     int ret;
2040 
2041     rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
2042     if (!rb) {
2043         /*
2044          * This should _never_ happen.  However, be nice and don't crash or
2045          * assert for a migrating VM.  Post an error (note: deliberately not
2046          * *_once, because we do want to see all the illegal addresses; and
2047          * this can never be triggered by the guest, so we're safe) and move
2048          * on to the next entry.
2048          */
2049         error_report("%s: illegal host addr %p", __func__, host_addr);
2050         /* Try the next entry */
2051         return FALSE;
2052     }
2053 
2054     ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
2055     if (ret) {
2056         /* Please refer to above comment. */
2057         error_report("%s: send rp message failed for addr %p",
2058                      __func__, host_addr);
2059         return FALSE;
2060     }
2061 
2062     trace_postcopy_page_req_sync(host_addr);
2063 
2064     return FALSE;
2065 }
2066 
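     /*
      * Re-send a page request to the source for every page still pending
      * in mis->page_requested; used when resuming an interrupted postcopy.
      */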
2067 static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
2068 {
2069     WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
2070         g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
2071     }
2072 }
2073 
2074 static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
2075 {
2076     if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
2077         error_report("%s: illegal resume received", __func__);
2078         /* Don't fail the load, only for this. */
2079         return 0;
2080     }
2081 
2082     /*
2083      * Reset the last_rb before we resend any page req to source again, since
2084      * the source should have it reset already.
2085      */
2086     mis->last_rb = NULL;
2087 
2088     /*
2089      * This means the source VM is ready to resume the postcopy migration.
2090      */
2091     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2092                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
2093 
2094     trace_loadvm_postcopy_handle_resume();
2095 
2096     /* Tell source that "we are ready" */
2097     migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
2098 
2099     /*
2100      * After a postcopy recovery, the source should have lost the postcopy
2101      * queue, or potentially the requested pages could have been lost during
2102      * the network down phase.  Let's re-sync with the source VM by re-sending
2103      * all the pending pages that we eagerly need, so these threads won't get
2104      * blocked too long due to the recovery.
2105      *
2106      * Without this procedure, the faulted destination VM threads (waiting for
2107      * page requests right before the postcopy is interrupted) can keep hanging
2108      * until the pages are sent by the source during the background copying of
2109      * pages, or until another thread happens to fault on the same address.
2110      */
2111     migrate_send_rp_req_pages_pending(mis);
2112 
2113     /*
2114      * It's time to switch state and release the fault thread to continue
2115      * service page faults.  Note that this should be explicitly after the
2116      * above call to migrate_send_rp_req_pages_pending().  In short:
2117      * migrate_send_rp_message_req_pages() is not thread safe, yet.
2118      */
2119     qemu_sem_post(&mis->postcopy_pause_sem_fault);
2120 
2121     if (migrate_postcopy_preempt()) {
2122         /* The channel should already be setup again; make sure of it */
2123         assert(mis->postcopy_qemufile_dst);
2124         /* Kick the fast ram load thread too */
2125         qemu_sem_post(&mis->postcopy_pause_sem_fast_load);
2126     }
2127 
2128     return 0;
2129 }
2130 
2131 /**
2132  * Immediately following this command is a blob of data containing an embedded
2133  * chunk of migration stream; read it and load it.
2134  *
2135  * @mis: Incoming state
2136  *
2137  * Returns: Negative values on error
2138  */
2141 static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2142 {
2143     int ret;
2144     size_t length;
2145     QIOChannelBuffer *bioc;
2146 
2147     length = qemu_get_be32(mis->from_src_file);
2148     trace_loadvm_handle_cmd_packaged(length);
2149 
2150     if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2151         error_report("Unreasonably large packaged state: %zu", length);
2152         return -1;
2153     }
2154 
2155     bioc = qio_channel_buffer_new(length);
2156     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2157     ret = qemu_get_buffer(mis->from_src_file,
2158                           bioc->data,
2159                           length);
2160     if (ret != length) {
2161         object_unref(OBJECT(bioc));
2162         error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2163                      ret, length);
2164         return (ret < 0) ? ret : -EAGAIN;
2165     }
2166     bioc->usage += length;
2167     trace_loadvm_handle_cmd_packaged_received(ret);
2168 
2169     QEMUFile *packf = qemu_file_new_input(QIO_CHANNEL(bioc));
2170 
2171     ret = qemu_loadvm_state_main(packf, mis);
2172     trace_loadvm_handle_cmd_packaged_main(ret);
2173     qemu_fclose(packf);
2174     object_unref(OBJECT(bioc));
2175 
2176     return ret;
2177 }
2178 
2179 /*
2180  * Handle a request from the source for the receive bitmap
2181  * (recved_bitmap) of a RAMBlock on the destination. Payload format:
2182  *
2183  * len (1 byte) + ramblock_name (<255 bytes)
2184  */
2185 static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2186                                      uint16_t len)
2187 {
2188     QEMUFile *file = mis->from_src_file;
2189     RAMBlock *rb;
2190     char block_name[256];
2191     size_t cnt;
2192 
2193     cnt = qemu_get_counted_string(file, block_name);
2194     if (!cnt) {
2195         error_report("%s: failed to read block name", __func__);
2196         return -EINVAL;
2197     }
2198 
2199     /* Validate before using the data */
2200     if (qemu_file_get_error(file)) {
2201         return qemu_file_get_error(file);
2202     }
2203 
2204     if (len != cnt + 1) {
2205         error_report("%s: invalid payload length (%d)", __func__, len);
2206         return -EINVAL;
2207     }
2208 
2209     rb = qemu_ram_block_by_name(block_name);
2210     if (!rb) {
2211         error_report("%s: block '%s' not found", __func__, block_name);
2212         return -EINVAL;
2213     }
2214 
2215     migrate_send_rp_recv_bitmap(mis, block_name);
2216 
2217     trace_loadvm_handle_recv_bitmap(block_name);
2218 
2219     return 0;
2220 }
2221 
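     /*
      * Enable COLO on the incoming side and initialise its RAM cache,
      * rolling COLO back again if the cache setup fails.
      */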
2222 static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2223 {
2224     int ret = migration_incoming_enable_colo();
2225 
2226     if (!ret) {
2227         ret = colo_init_ram_cache();
2228         if (ret) {
2229             migration_incoming_disable_colo();
2230         }
2231     }
2232     return ret;
2233 }
2234 
2235 /*
2236  * Process an incoming 'QEMU_VM_COMMAND'.
2237  * Returns: 0           just a normal return
2238  *          LOADVM_QUIT All good, but exit the loop
2239  *          <0          Error
2240  */
2241 static int loadvm_process_command(QEMUFile *f)
2242 {
2243     MigrationIncomingState *mis = migration_incoming_get_current();
2244     uint16_t cmd;
2245     uint16_t len;
2246     uint32_t tmp32;
2247 
2248     cmd = qemu_get_be16(f);
2249     len = qemu_get_be16(f);
2250 
2251     /* Check validity before continuing to process commands */
2252     if (qemu_file_get_error(f)) {
2253         return qemu_file_get_error(f);
2254     }
2255 
2256     if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2257         error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2258         return -EINVAL;
2259     }
2260 
2261     trace_loadvm_process_command(mig_cmd_args[cmd].name, len);
2262 
2263     if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2264         error_report("%s received with bad length - expecting %zu, got %d",
2265                      mig_cmd_args[cmd].name,
2266                      (size_t)mig_cmd_args[cmd].len, len);
2267         return -ERANGE;
2268     }
2269 
2270     switch (cmd) {
2271     case MIG_CMD_OPEN_RETURN_PATH:
2272         if (mis->to_src_file) {
2273             error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2274             /* Not really a problem, so don't give up */
2275             return 0;
2276         }
2277         mis->to_src_file = qemu_file_get_return_path(f);
2278         if (!mis->to_src_file) {
2279             error_report("CMD_OPEN_RETURN_PATH failed");
2280             return -1;
2281         }
2282         break;
2283 
2284     case MIG_CMD_PING:
2285         tmp32 = qemu_get_be32(f);
2286         trace_loadvm_process_command_ping(tmp32);
2287         if (!mis->to_src_file) {
2288             error_report("CMD_PING (0x%x) received with no return path",
2289                          tmp32);
2290             return -1;
2291         }
2292         migrate_send_rp_pong(mis, tmp32);
2293         break;
2294 
2295     case MIG_CMD_PACKAGED:
2296         return loadvm_handle_cmd_packaged(mis);
2297 
2298     case MIG_CMD_POSTCOPY_ADVISE:
2299         return loadvm_postcopy_handle_advise(mis, len);
2300 
2301     case MIG_CMD_POSTCOPY_LISTEN:
2302         return loadvm_postcopy_handle_listen(mis);
2303 
2304     case MIG_CMD_POSTCOPY_RUN:
2305         return loadvm_postcopy_handle_run(mis);
2306 
2307     case MIG_CMD_POSTCOPY_RAM_DISCARD:
2308         return loadvm_postcopy_ram_handle_discard(mis, len);
2309 
2310     case MIG_CMD_POSTCOPY_RESUME:
2311         return loadvm_postcopy_handle_resume(mis);
2312 
2313     case MIG_CMD_RECV_BITMAP:
2314         return loadvm_handle_recv_bitmap(mis, len);
2315 
2316     case MIG_CMD_ENABLE_COLO:
2317         return loadvm_process_enable_colo(mis);
2318     }
2319 
2320     return 0;
2321 }
2322 
2323 /*
2324  * Read a footer off the wire and check that it matches the expected section
2325  *
2326  * Returns: true if the footer was good
2327  *          false if there is a problem (and calls error_report to say why)
2328  */
2329 static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2330 {
2331     int ret;
2332     uint8_t read_mark;
2333     uint32_t read_section_id;
2334 
2335     if (!migrate_get_current()->send_section_footer) {
2336         /* No footer to check */
2337         return true;
2338     }
2339 
2340     read_mark = qemu_get_byte(f);
2341 
2342     ret = qemu_file_get_error(f);
2343     if (ret) {
2344         error_report("%s: Read section footer failed: %d",
2345                      __func__, ret);
2346         return false;
2347     }
2348 
2349     if (read_mark != QEMU_VM_SECTION_FOOTER) {
2350         error_report("Missing section footer for %s", se->idstr);
2351         return false;
2352     }
2353 
2354     read_section_id = qemu_get_be32(f);
2355     if (read_section_id != se->load_section_id) {
2356         error_report("Mismatched section id in footer for %s -"
2357                      " read 0x%x expected 0x%x",
2358                      se->idstr, read_section_id, se->load_section_id);
2359         return false;
2360     }
2361 
2362     /* All good */
2363     return true;
2364 }
2365 
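     /*
      * Read a SECTION_START/SECTION_FULL header, find the matching
      * SaveStateEntry, validate the version, then load the device state
      * and check the section footer.
      */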
2366 static int
2367 qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2368 {
2369     uint32_t instance_id, version_id, section_id;
2370     SaveStateEntry *se;
2371     char idstr[256];
2372     int ret;
2373 
2374     /* Read section start */
2375     section_id = qemu_get_be32(f);
2376     if (!qemu_get_counted_string(f, idstr)) {
2377         error_report("Unable to read ID string for section %u",
2378                      section_id);
2379         return -EINVAL;
2380     }
2381     instance_id = qemu_get_be32(f);
2382     version_id = qemu_get_be32(f);
2383 
2384     ret = qemu_file_get_error(f);
2385     if (ret) {
2386         error_report("%s: Failed to read instance/version ID: %d",
2387                      __func__, ret);
2388         return ret;
2389     }
2390 
2391     trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2392             instance_id, version_id);
2393     /* Find savevm section */
2394     se = find_se(idstr, instance_id);
2395     if (se == NULL) {
2396         error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2397                      "Make sure that your current VM setup matches your "
2398                      "saved VM setup, including any hotplugged devices",
2399                      idstr, instance_id);
2400         return -EINVAL;
2401     }
2402 
2403     /* Validate version */
2404     if (version_id > se->version_id) {
2405         error_report("savevm: unsupported version %d for '%s' v%d",
2406                      version_id, idstr, se->version_id);
2407         return -EINVAL;
2408     }
2409     se->load_version_id = version_id;
2410     se->load_section_id = section_id;
2411 
2412     /* Validate if it is a device's state */
2413     if (xen_enabled() && se->is_ram) {
2414         error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2415         return -EINVAL;
2416     }
2417 
2418     ret = vmstate_load(f, se);
2419     if (ret < 0) {
2420         error_report("error while loading state for instance 0x%"PRIx32" of"
2421                      " device '%s'", instance_id, idstr);
2422         return ret;
2423     }
2424     if (!check_section_footer(f, se)) {
2425         return -EINVAL;
2426     }
2427 
2428     return 0;
2429 }
2430 
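     /*
      * Read a SECTION_PART/SECTION_END chunk for a section that was
      * started earlier, identified by its section id.
      */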
2431 static int
2432 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2433 {
2434     uint32_t section_id;
2435     SaveStateEntry *se;
2436     int ret;
2437 
2438     section_id = qemu_get_be32(f);
2439 
2440     ret = qemu_file_get_error(f);
2441     if (ret) {
2442         error_report("%s: Failed to read section ID: %d",
2443                      __func__, ret);
2444         return ret;
2445     }
2446 
2447     trace_qemu_loadvm_state_section_partend(section_id);
2448     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2449         if (se->load_section_id == section_id) {
2450             break;
2451         }
2452     }
2453     if (se == NULL) {
2454         error_report("Unknown savevm section %d", section_id);
2455         return -EINVAL;
2456     }
2457 
2458     ret = vmstate_load(f, se);
2459     if (ret < 0) {
2460         error_report("error while loading state section id %d(%s)",
2461                      section_id, se->idstr);
2462         return ret;
2463     }
2464     if (!check_section_footer(f, se)) {
2465         return -EINVAL;
2466     }
2467 
2468     return 0;
2469 }
2470 
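     /*
      * Check the stream's magic and version numbers and, if the source
      * sends one, load the configuration section.
      */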
2471 static int qemu_loadvm_state_header(QEMUFile *f)
2472 {
2473     unsigned int v;
2474     int ret;
2475 
2476     v = qemu_get_be32(f);
2477     if (v != QEMU_VM_FILE_MAGIC) {
2478         error_report("Not a migration stream");
2479         return -EINVAL;
2480     }
2481 
2482     v = qemu_get_be32(f);
2483     if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2484         error_report("SaveVM v2 format is obsolete and no longer works");
2485         return -ENOTSUP;
2486     }
2487     if (v != QEMU_VM_FILE_VERSION) {
2488         error_report("Unsupported migration stream version");
2489         return -ENOTSUP;
2490     }
2491 
2492     if (migrate_get_current()->send_configuration) {
2493         if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2494             error_report("Configuration section missing");
2495             qemu_loadvm_state_cleanup();
2496             return -EINVAL;
2497         }
2498         ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2499 
2500         if (ret) {
2501             qemu_loadvm_state_cleanup();
2502             return ret;
2503         }
2504     }
2505     return 0;
2506 }
2507 
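     /*
      * Run each registered handler's load_setup callback, skipping
      * handlers that declare themselves inactive.
      */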
2508 static int qemu_loadvm_state_setup(QEMUFile *f)
2509 {
2510     SaveStateEntry *se;
2511     int ret;
2512 
2513     trace_loadvm_state_setup();
2514     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2515         if (!se->ops || !se->ops->load_setup) {
2516             continue;
2517         }
2518         if (se->ops->is_active) {
2519             if (!se->ops->is_active(se->opaque)) {
2520                 continue;
2521             }
2522         }
2523 
2524         ret = se->ops->load_setup(f, se->opaque);
2525         if (ret < 0) {
2526             qemu_file_set_error(f, ret);
2527             error_report("Load state of device %s failed", se->idstr);
2528             return ret;
2529         }
2530     }
2531     return 0;
2532 }
2533 
2534 void qemu_loadvm_state_cleanup(void)
2535 {
2536     SaveStateEntry *se;
2537 
2538     trace_loadvm_state_cleanup();
2539     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2540         if (se->ops && se->ops->load_cleanup) {
2541             se->ops->load_cleanup(se->opaque);
2542         }
2543     }
2544 }
2545 
2546 /* Return true if we should continue the migration, false otherwise. */
2547 static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2548 {
2549     int i;
2550 
2551     trace_postcopy_pause_incoming();
2552 
2553     assert(migrate_postcopy_ram());
2554 
2555     /*
2556      * Unregistering yank with either the from-src or the to-src file would
2557      * work, since the ioc behind them is the same
2558      */
2559     migration_ioc_unregister_yank_from_file(mis->from_src_file);
2560 
2561     assert(mis->from_src_file);
2562     qemu_file_shutdown(mis->from_src_file);
2563     qemu_fclose(mis->from_src_file);
2564     mis->from_src_file = NULL;
2565 
2566     assert(mis->to_src_file);
2567     qemu_file_shutdown(mis->to_src_file);
2568     qemu_mutex_lock(&mis->rp_mutex);
2569     qemu_fclose(mis->to_src_file);
2570     mis->to_src_file = NULL;
2571     qemu_mutex_unlock(&mis->rp_mutex);
2572 
2573     /*
2574      * NOTE: this must happen before resetting the PostcopyTmpPages below,
2575      * otherwise it's racy to reset those fields while the fast load thread
2576      * may be accessing them in parallel.
2577      */
2578     if (mis->postcopy_qemufile_dst) {
2579         qemu_file_shutdown(mis->postcopy_qemufile_dst);
2580         /* Take the mutex to make sure the fast ram load thread halted */
2581         qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
2582         migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
2583         qemu_fclose(mis->postcopy_qemufile_dst);
2584         mis->postcopy_qemufile_dst = NULL;
2585         qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
2586     }
2587 
2588     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2589                       MIGRATION_STATUS_POSTCOPY_PAUSED);
2590 
2591     /* Notify the fault thread for the invalidated file handle */
2592     postcopy_fault_thread_notify(mis);
2593 
2594     /*
2595      * If network is interrupted, any temp page we received will be useless
2596      * because we didn't mark them as "received" in receivedmap.  After a
2597      * proper recovery later (which will sync src dirty bitmap with receivedmap
2598      * on dest) these cached small pages will be resent again.
2599      */
2600     for (i = 0; i < mis->postcopy_channels; i++) {
2601         postcopy_temp_page_reset(&mis->postcopy_tmp_pages[i]);
2602     }
2603 
2604     error_report("Detected IO failure for postcopy. "
2605                  "Migration paused.");
2606 
2607     while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2608         qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2609     }
2610 
2611     trace_postcopy_pause_incoming_continued();
2612 
2613     return true;
2614 }
2615 
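     /*
      * Main incoming loop: dispatch section starts/parts/ends and commands
      * until QEMU_VM_EOF, an error, or LOADVM_QUIT.  If loading fails
      * during an active RAM postcopy, pause and retry on the recovered
      * channel instead of failing.
      */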
2616 int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2617 {
2618     uint8_t section_type;
2619     int ret = 0;
2620 
2621 retry:
2622     while (true) {
2623         section_type = qemu_get_byte(f);
2624 
2625         ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL);
2626         if (ret) {
2627             break;
2628         }
2629 
2630         trace_qemu_loadvm_state_section(section_type);
2631         switch (section_type) {
2632         case QEMU_VM_SECTION_START:
2633         case QEMU_VM_SECTION_FULL:
2634             ret = qemu_loadvm_section_start_full(f, mis);
2635             if (ret < 0) {
2636                 goto out;
2637             }
2638             break;
2639         case QEMU_VM_SECTION_PART:
2640         case QEMU_VM_SECTION_END:
2641             ret = qemu_loadvm_section_part_end(f, mis);
2642             if (ret < 0) {
2643                 goto out;
2644             }
2645             break;
2646         case QEMU_VM_COMMAND:
2647             ret = loadvm_process_command(f);
2648             trace_qemu_loadvm_state_section_command(ret);
2649             if ((ret < 0) || (ret == LOADVM_QUIT)) {
2650                 goto out;
2651             }
2652             break;
2653         case QEMU_VM_EOF:
2654             /* This is the end of migration */
2655             goto out;
2656         default:
2657             error_report("Unknown savevm section type %d", section_type);
2658             ret = -EINVAL;
2659             goto out;
2660         }
2661     }
2662 
2663 out:
2664     if (ret < 0) {
2665         qemu_file_set_error(f, ret);
2666 
2667         /* Cancel bitmaps incoming regardless of recovery */
2668         dirty_bitmap_mig_cancel_incoming();
2669 
2670         /*
2671          * If we are in an active postcopy, then we pause instead of
2672          * bailing out, to at least keep the VM's dirty data.  Note that
2673          * the POSTCOPY_INCOMING_LISTENING stage is still not enough,
2674          * since during it we're still receiving device states and we
2675          * haven't yet started the VM on the destination.
2676          *
2677          * Only RAM postcopy supports recovery. Still, if RAM postcopy is
2678          * enabled, a cancelled bitmaps postcopy will not affect RAM
2679          * postcopy recovery.
2680          */
2681         if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2682             migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
2683             /* Reset f to point to the newly created channel */
2684             f = mis->from_src_file;
2685             goto retry;
2686         }
2687     }
2688     return ret;
2689 }
2690 
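     /*
      * Load a complete VM state stream: header, per-device setup, the
      * main section loop, the optional trailing VMDESC section, then
      * cleanup.
      */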
2691 int qemu_loadvm_state(QEMUFile *f)
2692 {
2693     MigrationIncomingState *mis = migration_incoming_get_current();
2694     Error *local_err = NULL;
2695     int ret;
2696 
2697     if (qemu_savevm_state_blocked(&local_err)) {
2698         error_report_err(local_err);
2699         return -EINVAL;
2700     }
2701 
2702     ret = qemu_loadvm_state_header(f);
2703     if (ret) {
2704         return ret;
2705     }
2706 
2707     if (qemu_loadvm_state_setup(f) != 0) {
2708         return -EINVAL;
2709     }
2710 
2711     cpu_synchronize_all_pre_loadvm();
2712 
2713     ret = qemu_loadvm_state_main(f, mis);
2714     qemu_event_set(&mis->main_thread_load_event);
2715 
2716     trace_qemu_loadvm_state_post_main(ret);
2717 
2718     if (mis->have_listen_thread) {
2719         /* Listen thread still going, can't clean up yet */
2720         return ret;
2721     }
2722 
2723     if (ret == 0) {
2724         ret = qemu_file_get_error(f);
2725     }
2726 
2727     /*
2728      * Try to read in the VMDESC section as well, so that dumping tools that
2729      * intercept our migration stream have the chance to see it.
2730      */
2731 
2732     /* We've got to be careful; if we don't read the data and just shut the fd
2733      * then the sender can error if we close while it's still sending.
2734      * We also mustn't read data that isn't there; some transports (RDMA)
2735      * will stall waiting for that data when the source has already closed.
2736      */
2737     if (ret == 0 && should_send_vmdesc()) {
2738         uint8_t *buf;
2739         uint32_t size;
2740         uint8_t  section_type = qemu_get_byte(f);
2741 
2742         if (section_type != QEMU_VM_VMDESCRIPTION) {
2743             error_report("Expected vmdescription section, but got %d",
2744                          section_type);
2745             /*
2746              * It doesn't seem worth failing at this point since
2747              * we apparently have an otherwise valid VM state
2748              */
2749         } else {
2750             buf = g_malloc(0x1000);
2751             size = qemu_get_be32(f);
2752 
2753             while (size > 0) {
2754                 uint32_t read_chunk = MIN(size, 0x1000);
2755                 qemu_get_buffer(f, buf, read_chunk);
2756                 size -= read_chunk;
2757             }
2758             g_free(buf);
2759         }
2760     }
2761 
2762     qemu_loadvm_state_cleanup();
2763     cpu_synchronize_all_post_init();
2764 
2765     return ret;
2766 }
2767 
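     /*
      * Counterpart of qemu_save_device_state(): load a device-only
      * stream (no RAM) and synchronise the CPU state afterwards.
      */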
2768 int qemu_load_device_state(QEMUFile *f)
2769 {
2770     MigrationIncomingState *mis = migration_incoming_get_current();
2771     int ret;
2772 
2773     /* Load QEMU_VM_SECTION_FULL section */
2774     ret = qemu_loadvm_state_main(f, mis);
2775     if (ret < 0) {
2776         error_report("Failed to load device state: %d", ret);
2777         return ret;
2778     }
2779 
2780     cpu_synchronize_all_post_init();
2781     return 0;
2782 }
2783 
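     /*
      * Create an internal snapshot: stop the VM, stream the VM state into
      * the vmstate block device, then snapshot every eligible block
      * device.  Returns true on success.
      */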
2784 bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
2785                   bool has_devices, strList *devices, Error **errp)
2786 {
2787     BlockDriverState *bs;
2788     QEMUSnapshotInfo sn1, *sn = &sn1;
2789     int ret = -1, ret2;
2790     QEMUFile *f;
2791     int saved_vm_running;
2792     uint64_t vm_state_size;
2793     g_autoptr(GDateTime) now = g_date_time_new_now_local();
2794     AioContext *aio_context;
2795 
2796     GLOBAL_STATE_CODE();
2797 
2798     if (migration_is_blocked(errp)) {
2799         return false;
2800     }
2801 
2802     if (!replay_can_snapshot()) {
2803         error_setg(errp, "Record/replay does not allow making a snapshot "
2804                    "right now. Try again later.");
2805         return false;
2806     }
2807 
2808     if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
2809         return false;
2810     }
2811 
2812     /* Delete old snapshots of the same name */
2813     if (name) {
2814         if (overwrite) {
2815             if (bdrv_all_delete_snapshot(name, has_devices,
2816                                          devices, errp) < 0) {
2817                 return false;
2818             }
2819         } else {
2820             ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
2821             if (ret2 < 0) {
2822                 return false;
2823             }
2824             if (ret2 == 1) {
2825                 error_setg(errp,
2826                            "Snapshot '%s' already exists in one or more devices",
2827                            name);
2828                 return false;
2829             }
2830         }
2831     }
2832 
2833     bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
2834     if (bs == NULL) {
2835         return false;
2836     }
2837     aio_context = bdrv_get_aio_context(bs);
2838 
2839     saved_vm_running = runstate_is_running();
2840 
2841     ret = global_state_store();
2842     if (ret) {
2843         error_setg(errp, "Error saving global state");
2844         return false;
2845     }
2846     vm_stop(RUN_STATE_SAVE_VM);
2847 
2848     bdrv_drain_all_begin();
2849 
2850     aio_context_acquire(aio_context);
2851 
2852     memset(sn, 0, sizeof(*sn));
2853 
2854     /* fill auxiliary fields */
2855     sn->date_sec = g_date_time_to_unix(now);
2856     sn->date_nsec = g_date_time_get_microsecond(now) * 1000;
2857     sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2858     if (replay_mode != REPLAY_MODE_NONE) {
2859         sn->icount = replay_get_current_icount();
2860     } else {
2861         sn->icount = -1ULL;
2862     }
2863 
2864     if (name) {
2865         pstrcpy(sn->name, sizeof(sn->name), name);
2866     } else {
2867         g_autofree char *autoname = g_date_time_format(now, "vm-%Y%m%d%H%M%S");
2868         pstrcpy(sn->name, sizeof(sn->name), autoname);
2869     }
2870 
2871     /* save the VM state */
2872     f = qemu_fopen_bdrv(bs, 1);
2873     if (!f) {
2874         error_setg(errp, "Could not open VM state file");
2875         goto the_end;
2876     }
2877     ret = qemu_savevm_state(f, errp);
2878     vm_state_size = qemu_file_total_transferred(f);
2879     ret2 = qemu_fclose(f);
2880     if (ret < 0) {
2881         goto the_end;
2882     }
2883     if (ret2 < 0) {
2884         ret = ret2;
2885         goto the_end;
2886     }
2887 
2888     /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2889      * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2890      * it only releases the lock once.  Therefore synchronous I/O will deadlock
2891      * unless we release the AioContext before bdrv_all_create_snapshot().
2892      */
2893     aio_context_release(aio_context);
2894     aio_context = NULL;
2895 
2896     ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
2897                                    has_devices, devices, errp);
2898     if (ret < 0) {
2899         bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
2900         goto the_end;
2901     }
2902 
2903     ret = 0;
2904 
2905  the_end:
2906     if (aio_context) {
2907         aio_context_release(aio_context);
2908     }
2909 
2910     bdrv_drain_all_end();
2911 
2912     if (saved_vm_running) {
2913         vm_start();
2914     }
2915     return ret == 0;
2916 }
2917 
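     /*
      * QMP xen-save-devices-state: write only the device (non-RAM) state
      * to a file for the Xen toolstack; RAM is handled separately by xc.
      */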
2918 void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2919                                 Error **errp)
2920 {
2921     QEMUFile *f;
2922     QIOChannelFile *ioc;
2923     int saved_vm_running;
2924     int ret;
2925 
2926     if (!has_live) {
2927         /* live defaults to true so old versions of the Xen tool stack can
2928          * have a successful live migration */
2929         live = true;
2930     }
2931 
2932     saved_vm_running = runstate_is_running();
2933     vm_stop(RUN_STATE_SAVE_VM);
2934     global_state_store_running();
2935 
2936     ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
2937                                     0660, errp);
2938     if (!ioc) {
2939         goto the_end;
2940     }
2941     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2942     f = qemu_file_new_output(QIO_CHANNEL(ioc));
2943     object_unref(OBJECT(ioc));
2944     ret = qemu_save_device_state(f);
2945     if (ret < 0 || qemu_fclose(f) < 0) {
2946         error_setg(errp, QERR_IO_ERROR);
2947     } else {
2948         /* libxl calls the QMP command "stop" before calling
2949          * "xen-save-devices-state" and in case of migration failure, libxl
2950          * would call "cont".
2951          * So call bdrv_inactivate_all (release locks) here to let the other
2952          * side of the migration take control of the images.
2953          */
2954         if (live && !saved_vm_running) {
2955             ret = bdrv_inactivate_all();
2956             if (ret) {
2957                 error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2958                            __func__, ret);
2959             }
2960         }
2961     }
2962 
2963  the_end:
2964     if (saved_vm_running) {
2965         vm_start();
2966     }
2967 }
2968 
2969 void qmp_xen_load_devices_state(const char *filename, Error **errp)
2970 {
2971     QEMUFile *f;
2972     QIOChannelFile *ioc;
2973     int ret;
2974 
2975     /* Guest must be paused before loading the device state; the RAM state
2976      * will already have been loaded by xc
2977      */
2978     if (runstate_is_running()) {
2979         error_setg(errp, "Cannot update device state while vm is running");
2980         return;
2981     }
2982     vm_stop(RUN_STATE_RESTORE_VM);
2983 
2984     ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2985     if (!ioc) {
2986         return;
2987     }
2988     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2989     f = qemu_file_new_input(QIO_CHANNEL(ioc));
2990     object_unref(OBJECT(ioc));
2991 
2992     ret = qemu_loadvm_state(f);
2993     qemu_fclose(f);
2994     if (ret < 0) {
2995         error_setg(errp, QERR_IO_ERROR);
2996     }
2997     migration_incoming_state_destroy();
2998 }
2999 
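     /*
      * Revert to an internal snapshot: verify it exists on all devices,
      * roll the disks back, then reset the machine and load the saved VM
      * state.  Returns true on success.
      */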
3000 bool load_snapshot(const char *name, const char *vmstate,
3001                    bool has_devices, strList *devices, Error **errp)
3002 {
3003     BlockDriverState *bs_vm_state;
3004     QEMUSnapshotInfo sn;
3005     QEMUFile *f;
3006     int ret;
3007     AioContext *aio_context;
3008     MigrationIncomingState *mis = migration_incoming_get_current();
3009 
3010     if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3011         return false;
3012     }
3013     ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
3014     if (ret < 0) {
3015         return false;
3016     }
3017     if (ret == 0) {
3018         error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
3019                    name);
3020         return false;
3021     }
3022 
3023     bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
3024     if (!bs_vm_state) {
3025         return false;
3026     }
3027     aio_context = bdrv_get_aio_context(bs_vm_state);
3028 
3029     /* Don't even try to load empty VM states */
3030     aio_context_acquire(aio_context);
3031     ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
3032     aio_context_release(aio_context);
3033     if (ret < 0) {
3034         return false;
3035     } else if (sn.vm_state_size == 0) {
3036         error_setg(errp, "This is a disk-only snapshot. Revert to it "
3037                    "offline using qemu-img");
3038         return false;
3039     }
3040 
3041     /*
3042      * Flush the record/replay queue. The VM state is about to change,
3043      * so we no longer need to preserve its consistency.
3044      */
3045     replay_flush_events();
3046 
3047     /* Flush all IO requests so they don't interfere with the new state.  */
3048     bdrv_drain_all_begin();
3049 
3050     ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
3051     if (ret < 0) {
3052         goto err_drain;
3053     }
3054 
3055     /* restore the VM state */
3056     f = qemu_fopen_bdrv(bs_vm_state, 0);
3057     if (!f) {
3058         error_setg(errp, "Could not open VM state file");
3059         goto err_drain;
3060     }
3061 
3062     qemu_system_reset(SHUTDOWN_CAUSE_SNAPSHOT_LOAD);
3063     mis->from_src_file = f;
3064 
3065     if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
3066         ret = -EINVAL;
3067         goto err_drain;
3068     }
3069     aio_context_acquire(aio_context);
3070     ret = qemu_loadvm_state(f);
3071     migration_incoming_state_destroy();
3072     aio_context_release(aio_context);
3073 
3074     bdrv_drain_all_end();
3075 
3076     if (ret < 0) {
3077         error_setg(errp, "Error %d while loading VM state", ret);
3078         return false;
3079     }
3080 
3081     return true;
3082 
3083 err_drain:
3084     bdrv_drain_all_end();
3085     return false;
3086 }
3087 
3088 bool delete_snapshot(const char *name, bool has_devices,
3089                      strList *devices, Error **errp)
3090 {
3091     if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3092         return false;
3093     }
3094 
3095     if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
3096         return false;
3097     }
3098 
3099     return true;
3100 }
3101 
3102 void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
3103 {
3104     qemu_ram_set_idstr(mr->ram_block,
3105                        memory_region_name(mr), dev);
3106     qemu_ram_set_migratable(mr->ram_block);
3107 }
3108 
3109 void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
3110 {
3111     qemu_ram_unset_idstr(mr->ram_block);
3112     qemu_ram_unset_migratable(mr->ram_block);
3113 }
3114 
3115 void vmstate_register_ram_global(MemoryRegion *mr)
3116 {
3117     vmstate_register_ram(mr, NULL);
3118 }
3119 
3120 bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
3121 {
3122     /* check needed if --only-migratable is specified */
3123     if (!only_migratable) {
3124         return true;
3125     }
3126 
3127     return !(vmsd && vmsd->unmigratable);
3128 }
3129 
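     /*
      * Job state shared by the snapshot-{save,load,delete} QMP commands
      * below; the actual work runs in a main-loop bottom half while the
      * job coroutine yields.
      */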
3130 typedef struct SnapshotJob {
3131     Job common;
3132     char *tag;
3133     char *vmstate;
3134     strList *devices;
3135     Coroutine *co;
3136     Error **errp;
3137     bool ret;
3138 } SnapshotJob;
3139 
3140 static void qmp_snapshot_job_free(SnapshotJob *s)
3141 {
3142     g_free(s->tag);
3143     g_free(s->vmstate);
3144     qapi_free_strList(s->devices);
3145 }
3146 
3147 
3148 static void snapshot_load_job_bh(void *opaque)
3149 {
3150     Job *job = opaque;
3151     SnapshotJob *s = container_of(job, SnapshotJob, common);
3152     int orig_vm_running;
3153 
3154     job_progress_set_remaining(&s->common, 1);
3155 
3156     orig_vm_running = runstate_is_running();
3157     vm_stop(RUN_STATE_RESTORE_VM);
3158 
3159     s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
3160     if (s->ret && orig_vm_running) {
3161         vm_start();
3162     }
3163 
3164     job_progress_update(&s->common, 1);
3165 
3166     qmp_snapshot_job_free(s);
3167     aio_co_wake(s->co);
3168 }
3169 
3170 static void snapshot_save_job_bh(void *opaque)
3171 {
3172     Job *job = opaque;
3173     SnapshotJob *s = container_of(job, SnapshotJob, common);
3174 
3175     job_progress_set_remaining(&s->common, 1);
3176     s->ret = save_snapshot(s->tag, false, s->vmstate,
3177                            true, s->devices, s->errp);
3178     job_progress_update(&s->common, 1);
3179 
3180     qmp_snapshot_job_free(s);
3181     aio_co_wake(s->co);
3182 }
3183 
3184 static void snapshot_delete_job_bh(void *opaque)
3185 {
3186     Job *job = opaque;
3187     SnapshotJob *s = container_of(job, SnapshotJob, common);
3188 
3189     job_progress_set_remaining(&s->common, 1);
3190     s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
3191     job_progress_update(&s->common, 1);
3192 
3193     qmp_snapshot_job_free(s);
3194     aio_co_wake(s->co);
3195 }
3196 
3197 static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
3198 {
3199     SnapshotJob *s = container_of(job, SnapshotJob, common);
3200     s->errp = errp;
3201     s->co = qemu_coroutine_self();
3202     aio_bh_schedule_oneshot(qemu_get_aio_context(),
3203                             snapshot_save_job_bh, job);
3204     qemu_coroutine_yield();
3205     return s->ret ? 0 : -1;
3206 }
3207 
3208 static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
3209 {
3210     SnapshotJob *s = container_of(job, SnapshotJob, common);
3211     s->errp = errp;
3212     s->co = qemu_coroutine_self();
3213     aio_bh_schedule_oneshot(qemu_get_aio_context(),
3214                             snapshot_load_job_bh, job);
3215     qemu_coroutine_yield();
3216     return s->ret ? 0 : -1;
3217 }
3218 
3219 static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
3220 {
3221     SnapshotJob *s = container_of(job, SnapshotJob, common);
3222     s->errp = errp;
3223     s->co = qemu_coroutine_self();
3224     aio_bh_schedule_oneshot(qemu_get_aio_context(),
3225                             snapshot_delete_job_bh, job);
3226     qemu_coroutine_yield();
3227     return s->ret ? 0 : -1;
3228 }
3229 
3230 
3231 static const JobDriver snapshot_load_job_driver = {
3232     .instance_size = sizeof(SnapshotJob),
3233     .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
3234     .run           = snapshot_load_job_run,
3235 };
3236 
3237 static const JobDriver snapshot_save_job_driver = {
3238     .instance_size = sizeof(SnapshotJob),
3239     .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
3240     .run           = snapshot_save_job_run,
3241 };
3242 
3243 static const JobDriver snapshot_delete_job_driver = {
3244     .instance_size = sizeof(SnapshotJob),
3245     .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
3246     .run           = snapshot_delete_job_run,
3247 };
3248 
3249 
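     /*
      * Example usage (QMP; the tag and node names are illustrative):
      *   { "execute": "snapshot-save",
      *     "arguments": { "job-id": "snapsave0", "tag": "my-snap",
      *                    "vmstate": "disk0", "devices": ["disk0"] } }
      */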
3250 void qmp_snapshot_save(const char *job_id,
3251                        const char *tag,
3252                        const char *vmstate,
3253                        strList *devices,
3254                        Error **errp)
3255 {
3256     SnapshotJob *s;
3257 
3258     s = job_create(job_id, &snapshot_save_job_driver, NULL,
3259                    qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3260                    NULL, NULL, errp);
3261     if (!s) {
3262         return;
3263     }
3264 
3265     s->tag = g_strdup(tag);
3266     s->vmstate = g_strdup(vmstate);
3267     s->devices = QAPI_CLONE(strList, devices);
3268 
3269     job_start(&s->common);
3270 }
3271 
3272 void qmp_snapshot_load(const char *job_id,
3273                        const char *tag,
3274                        const char *vmstate,
3275                        strList *devices,
3276                        Error **errp)
3277 {
3278     SnapshotJob *s;
3279 
3280     s = job_create(job_id, &snapshot_load_job_driver, NULL,
3281                    qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3282                    NULL, NULL, errp);
3283     if (!s) {
3284         return;
3285     }
3286 
3287     s->tag = g_strdup(tag);
3288     s->vmstate = g_strdup(vmstate);
3289     s->devices = QAPI_CLONE(strList, devices);
3290 
3291     job_start(&s->common);
3292 }
3293 
3294 void qmp_snapshot_delete(const char *job_id,
3295                          const char *tag,
3296                          strList *devices,
3297                          Error **errp)
3298 {
3299     SnapshotJob *s;
3300 
3301     s = job_create(job_id, &snapshot_delete_job_driver, NULL,
3302                    qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3303                    NULL, NULL, errp);
3304     if (!s) {
3305         return;
3306     }
3307 
3308     s->tag = g_strdup(tag);
3309     s->devices = QAPI_CLONE(strList, devices);
3310 
3311     job_start(&s->common);
3312 }
3313