xref: /openbmc/qemu/migration/savevm.c (revision a9ded601)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2009-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "hw/boards.h"
31 #include "hw/hw.h"
32 #include "hw/qdev.h"
33 #include "hw/xen/xen.h"
34 #include "net/net.h"
35 #include "sysemu/sysemu.h"
36 #include "qemu/timer.h"
37 #include "migration.h"
38 #include "migration/snapshot.h"
39 #include "migration/misc.h"
40 #include "migration/register.h"
41 #include "migration/global_state.h"
42 #include "ram.h"
43 #include "qemu-file-channel.h"
44 #include "qemu-file.h"
45 #include "savevm.h"
46 #include "postcopy-ram.h"
47 #include "qapi/qmp/qerror.h"
48 #include "qemu/error-report.h"
49 #include "qemu/queue.h"
50 #include "sysemu/cpus.h"
51 #include "exec/memory.h"
52 #include "exec/target_page.h"
53 #include "qmp-commands.h"
54 #include "trace.h"
55 #include "qemu/bitops.h"
56 #include "qemu/iov.h"
57 #include "block/snapshot.h"
58 #include "qemu/cutils.h"
59 #include "io/channel-buffer.h"
60 #include "io/channel-file.h"
61 
/* EtherType for RARP; defined here only when no earlier header provided it. */
#ifndef ETH_P_RARP
#define ETH_P_RARP 0x8035
#endif
/* Field values used by announce_self_create() to fill in the RARP header. */
#define ARP_HTYPE_ETH 0x0001
#define ARP_PTYPE_IP 0x0800
#define ARP_OP_REQUEST_REV 0x3

/* Written as the version byte of a MIG_CMD_POSTCOPY_RAM_DISCARD payload. */
const unsigned int postcopy_ram_discard_version = 0;

/*
 * Set by savevm_skip_section_footers(); when true, save_section_footer()
 * writes nothing.
 */
static bool skip_section_footers;
72 
/*
 * Subcommands for QEMU_VM_COMMAND.
 *
 * qemu_savevm_command_send() puts the enumerator value on the wire as a
 * be16 and mig_cmd_args[] is indexed by it, so reordering or inserting
 * entries changes the on-wire values.
 */
enum qemu_vm_cmd {
    MIG_CMD_INVALID = 0,   /* Must be 0 */
    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
    MIG_CMD_PING,              /* Request a PONG on the return path (RP) */

    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
                                      warn we might want to do postcopy */
    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
                                      pages as it's running. */
    MIG_CMD_POSTCOPY_RUN,          /* Start execution */

    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
                                      were previously sent during
                                      precopy but are dirty. */
    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
    MIG_CMD_MAX                /* Number of commands; not a command itself */
};
91 
/* Upper bound on the buffer accepted by qemu_savevm_send_packaged(). */
#define MAX_VM_CMD_PACKAGED_SIZE (1ul << 24)
/*
 * Per-command payload length and printable name, indexed by
 * enum qemu_vm_cmd.  The fixed lengths match what the senders in this file
 * pass to qemu_savevm_command_send(): 4 bytes for PING and PACKAGED,
 * 16 bytes (two be64 values) for POSTCOPY_ADVISE.
 * NOTE(review): the code that consumes this table is not in the part of the
 * file reviewed here.
 */
static struct mig_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name;
} mig_cmd_args[] = {
    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = 16, .name = "POSTCOPY_ADVISE" },
    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
};
108 
/* Store @value at @p in network (big-endian) byte order, one byte at a time. */
static void announce_put_be16(uint8_t *p, uint16_t value)
{
    p[0] = value >> 8;
    p[1] = value & 0xff;
}

/*
 * Build a broadcast RARP "reverse request" frame announcing @mac_addr.
 *
 * @buf:      output buffer, at least 60 bytes; no alignment is required
 * @mac_addr: 6-byte MAC address used as Ethernet source and as both the
 *            source and target hardware address in the RARP body
 *
 * The 16-bit header fields are written byte-wise rather than through a
 * uint16_t pointer, so the function is safe for an unaligned @buf and does
 * not alias the byte buffer through a wider type.
 *
 * Returns the frame length (always 60).
 */
static int announce_self_create(uint8_t *buf,
                                uint8_t *mac_addr)
{
    /* Ethernet header. */
    memset(buf, 0xff, 6);         /* destination MAC addr */
    memcpy(buf + 6, mac_addr, 6); /* source MAC addr */
    announce_put_be16(buf + 12, ETH_P_RARP); /* ethertype */

    /* RARP header. */
    announce_put_be16(buf + 14, ARP_HTYPE_ETH); /* hardware addr space */
    announce_put_be16(buf + 16, ARP_PTYPE_IP); /* protocol addr space */
    *(buf + 18) = 6; /* hardware addr length (ethernet) */
    *(buf + 19) = 4; /* protocol addr length (IPv4) */
    announce_put_be16(buf + 20, ARP_OP_REQUEST_REV); /* opcode */
    memcpy(buf + 22, mac_addr, 6); /* source hw addr */
    memset(buf + 28, 0x00, 4);     /* source protocol addr */
    memcpy(buf + 32, mac_addr, 6); /* target hw addr */
    memset(buf + 38, 0x00, 4);     /* target protocol addr */

    /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */
    memset(buf + 42, 0x00, 18);

    return 60; /* len (FCS will be added by hardware) */
}
133 
134 static void qemu_announce_self_iter(NICState *nic, void *opaque)
135 {
136     uint8_t buf[60];
137     int len;
138 
139     trace_qemu_announce_self_iter(qemu_ether_ntoa(&nic->conf->macaddr));
140     len = announce_self_create(buf, nic->conf->macaddr.a);
141 
142     qemu_send_packet_raw(qemu_get_queue(nic), buf, len);
143 }
144 
145 
146 static void qemu_announce_self_once(void *opaque)
147 {
148     static int count = SELF_ANNOUNCE_ROUNDS;
149     QEMUTimer *timer = *(QEMUTimer **)opaque;
150 
151     qemu_foreach_nic(qemu_announce_self_iter, NULL);
152 
153     if (--count) {
154         /* delay 50ms, 150ms, 250ms, ... */
155         timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
156                   self_announce_delay(count));
157     } else {
158             timer_del(timer);
159             timer_free(timer);
160     }
161 }
162 
/*
 * Start announcing this VM's NICs on the network: create a realtime-clock
 * millisecond timer whose callback is qemu_announce_self_once(), then run
 * the first announce round immediately.  The callback is handed the address
 * of the static 'timer' slot so it can re-arm or free the timer itself.
 *
 * NOTE(review): 'timer' is a single static slot overwritten on every call;
 * calling this again while an earlier sequence is still pending looks like
 * it would orphan the earlier timer -- confirm callers never do that.
 */
void qemu_announce_self(void)
{
    static QEMUTimer *timer;
    timer = timer_new_ms(QEMU_CLOCK_REALTIME, qemu_announce_self_once, &timer);
    qemu_announce_self_once(&timer);
}
169 
170 /***********************************************************/
171 /* savevm/loadvm support */
172 
173 static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
174                                    int64_t pos)
175 {
176     int ret;
177     QEMUIOVector qiov;
178 
179     qemu_iovec_init_external(&qiov, iov, iovcnt);
180     ret = bdrv_writev_vmstate(opaque, &qiov, pos);
181     if (ret < 0) {
182         return ret;
183     }
184 
185     return qiov.size;
186 }
187 
/*
 * QEMUFileOps .get_buffer hook for snapshot loading: forwards the request
 * for @size bytes at offset @pos into @buf to bdrv_load_vmstate(), with
 * @opaque as the block device, and returns that call's result unchanged.
 */
static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
                                size_t size)
{
    return bdrv_load_vmstate(opaque, buf, pos, size);
}
193 
/*
 * QEMUFileOps .close hook: flushes the block device passed as @opaque and
 * returns bdrv_flush()'s result.  Nothing else is released here.
 */
static int bdrv_fclose(void *opaque)
{
    return bdrv_flush(opaque);
}
198 
/* QEMUFile operations used by qemu_fopen_bdrv() for a read-only stream. */
static const QEMUFileOps bdrv_read_ops = {
    .get_buffer = block_get_buffer,
    .close =      bdrv_fclose
};
203 
/* QEMUFile operations used by qemu_fopen_bdrv() for a writable stream. */
static const QEMUFileOps bdrv_write_ops = {
    .writev_buffer  = block_writev_buffer,
    .close          = bdrv_fclose
};
208 
209 static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
210 {
211     if (is_writable) {
212         return qemu_fopen_ops(bs, &bdrv_write_ops);
213     }
214     return qemu_fopen_ops(bs, &bdrv_read_ops);
215 }
216 
217 
218 /* QEMUFile timer support.
219  * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
220  */
221 
222 void timer_put(QEMUFile *f, QEMUTimer *ts)
223 {
224     uint64_t expire_time;
225 
226     expire_time = timer_expire_time_ns(ts);
227     qemu_put_be64(f, expire_time);
228 }
229 
230 void timer_get(QEMUFile *f, QEMUTimer *ts)
231 {
232     uint64_t expire_time;
233 
234     expire_time = qemu_get_be64(f);
235     if (expire_time != -1) {
236         timer_mod_ns(ts, expire_time);
237     } else {
238         timer_del(ts);
239     }
240 }
241 
242 
243 /* VMState timer support.
244  * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
245  */
246 
247 static int get_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field)
248 {
249     QEMUTimer *v = pv;
250     timer_get(f, v);
251     return 0;
252 }
253 
254 static int put_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field,
255                      QJSON *vmdesc)
256 {
257     QEMUTimer *v = pv;
258     timer_put(f, v);
259 
260     return 0;
261 }
262 
/* VMStateInfo for QEMUTimer fields, backed by get_timer() and put_timer(). */
const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};
268 
269 
/*
 * Alternative identity attached to a SaveStateEntry that was registered with
 * a device path: the bare section name (no path prefix) and an instance id
 * for it.
 * NOTE(review): presumably used to match streams from older QEMU versions;
 * the load-side lookup is not in the part of the file reviewed here.
 */
typedef struct CompatEntry {
    char idstr[256];
    int instance_id;
} CompatEntry;
274 
/* One registered savevm handler (old-style ops or a VMStateDescription). */
typedef struct SaveStateEntry {
    /* linkage in savevm_state.handlers */
    QTAILQ_ENTRY(SaveStateEntry) entry;
    /* "<device path>/<name>" when a device path exists, else just the name */
    char idstr[256];
    /* distinguishes entries that share the same idstr */
    int instance_id;
    /* alias id passed to vmstate_register_with_alias_id() */
    int alias_id;
    /* version written into the section header */
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    /* taken from savevm_state.global_section_id at registration time */
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    /* old-style handlers; used when vmsd is NULL */
    SaveVMHandlers *ops;
    /* VMState description; NULL for register_savevm_live() entries */
    const VMStateDescription *vmsd;
    /* handed back to the ops / vmsd callbacks */
    void *opaque;
    /* only allocated when the entry was registered with a device path */
    CompatEntry *compat;
    /* set to 1 by register_savevm_live() when ops->save_live_setup exists */
    int is_ram;
} SaveStateEntry;
292 
/*
 * Global savevm bookkeeping, which doubles as the object described by
 * vmstate_configuration (len, name, target_page_bits).
 */
typedef struct SaveState {
    /* registered entries; savevm_state_handler_insert() keeps them ordered
     * from higher to lower priority */
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    /* next section id to hand out at registration */
    int global_section_id;
    /* set by savevm_skip_configuration() */
    bool skip_configuration;
    /* length in bytes of 'name' (no terminating NUL counted) */
    uint32_t len;
    /* machine type name sent in / received from the configuration section */
    const char *name;
    /* target page bits sent in / received from the optional subsection */
    uint32_t target_page_bits;
} SaveState;
301 
/* The single global SaveState: empty handler list, section ids from 0. */
static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .global_section_id = 0,
    .skip_configuration = false,
};
307 
/*
 * Ask for the configuration section to be left out of the stream header.
 * qemu_savevm_state_header() still writes it when enforce_config_section()
 * returns true.
 */
void savevm_skip_configuration(void)
{
    savevm_state.skip_configuration = true;
}
312 
313 
314 static void configuration_pre_save(void *opaque)
315 {
316     SaveState *state = opaque;
317     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
318 
319     state->len = strlen(current_name);
320     state->name = current_name;
321     state->target_page_bits = qemu_target_page_bits();
322 }
323 
324 static int configuration_pre_load(void *opaque)
325 {
326     SaveState *state = opaque;
327 
328     /* If there is no target-page-bits subsection it means the source
329      * predates the variable-target-page-bits support and is using the
330      * minimum possible value for this CPU.
331      */
332     state->target_page_bits = qemu_target_page_bits_min();
333     return 0;
334 }
335 
336 static int configuration_post_load(void *opaque, int version_id)
337 {
338     SaveState *state = opaque;
339     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
340 
341     if (strncmp(state->name, current_name, state->len) != 0) {
342         error_report("Machine type received is '%.*s' and local is '%s'",
343                      (int) state->len, state->name, current_name);
344         return -EINVAL;
345     }
346 
347     if (state->target_page_bits != qemu_target_page_bits()) {
348         error_report("Received TARGET_PAGE_BITS is %d but local is %d",
349                      state->target_page_bits, qemu_target_page_bits());
350         return -EINVAL;
351     }
352 
353     return 0;
354 }
355 
/*
 * .needed hook for the target-page-bits subsection.
 *
 * The subsection is sent only when the target page size differs from the
 * default, i.e. it is larger than the minimum page size of a
 * variable-page-size guest CPU.  When present it carries the machine's
 * actual target page bits, and migration fails if the two ends disagree.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    return qemu_target_page_bits_min() < qemu_target_page_bits();
}
368 
/*
 * Optional subsection of the configuration section carrying
 * SaveState.target_page_bits as a 32-bit value; it is emitted only when
 * vmstate_target_page_bits_needed() returns true.
 */
static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};
379 
/*
 * Description of the configuration section written by
 * qemu_savevm_state_header(): a 32-bit name length followed by that many
 * bytes of machine type name, plus the optional target-page-bits
 * subsection.  The pre_save/pre_load/post_load hooks fill in and validate
 * the SaveState fields.
 * NOTE(review): the name field uses the _ALLOC buffer variant, which
 * presumably allocates the buffer on load -- no matching free is visible in
 * the part of the file reviewed here.
 */
static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_target_page_bits,
        NULL
    }
};
396 
397 static void dump_vmstate_vmsd(FILE *out_file,
398                               const VMStateDescription *vmsd, int indent,
399                               bool is_subsection);
400 
401 static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
402                               int indent)
403 {
404     fprintf(out_file, "%*s{\n", indent, "");
405     indent += 2;
406     fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
407     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
408             field->version_id);
409     fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
410             field->field_exists ? "true" : "false");
411     fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
412     if (field->vmsd != NULL) {
413         fprintf(out_file, ",\n");
414         dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
415     }
416     fprintf(out_file, "\n%*s}", indent - 2, "");
417 }
418 
419 static void dump_vmstate_vmss(FILE *out_file,
420                               const VMStateDescription **subsection,
421                               int indent)
422 {
423     if (*subsection != NULL) {
424         dump_vmstate_vmsd(out_file, *subsection, indent, true);
425     }
426 }
427 
428 static void dump_vmstate_vmsd(FILE *out_file,
429                               const VMStateDescription *vmsd, int indent,
430                               bool is_subsection)
431 {
432     if (is_subsection) {
433         fprintf(out_file, "%*s{\n", indent, "");
434     } else {
435         fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
436     }
437     indent += 2;
438     fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
439     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
440             vmsd->version_id);
441     fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
442             vmsd->minimum_version_id);
443     if (vmsd->fields != NULL) {
444         const VMStateField *field = vmsd->fields;
445         bool first;
446 
447         fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
448         first = true;
449         while (field->name != NULL) {
450             if (field->flags & VMS_MUST_EXIST) {
451                 /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
452                 field++;
453                 continue;
454             }
455             if (!first) {
456                 fprintf(out_file, ",\n");
457             }
458             dump_vmstate_vmsf(out_file, field, indent + 2);
459             field++;
460             first = false;
461         }
462         fprintf(out_file, "\n%*s]", indent, "");
463     }
464     if (vmsd->subsections != NULL) {
465         const VMStateDescription **subsection = vmsd->subsections;
466         bool first;
467 
468         fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
469         first = true;
470         while (*subsection != NULL) {
471             if (!first) {
472                 fprintf(out_file, ",\n");
473             }
474             dump_vmstate_vmss(out_file, subsection, indent + 2);
475             subsection++;
476             first = false;
477         }
478         fprintf(out_file, "\n%*s]", indent, "");
479     }
480     fprintf(out_file, "\n%*s}", indent - 2, "");
481 }
482 
483 static void dump_machine_type(FILE *out_file)
484 {
485     MachineClass *mc;
486 
487     mc = MACHINE_GET_CLASS(current_machine);
488 
489     fprintf(out_file, "  \"vmschkmachine\": {\n");
490     fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
491     fprintf(out_file, "  },\n");
492 }
493 
494 void dump_vmstate_json_to_file(FILE *out_file)
495 {
496     GSList *list, *elt;
497     bool first;
498 
499     fprintf(out_file, "{\n");
500     dump_machine_type(out_file);
501 
502     first = true;
503     list = object_class_get_list(TYPE_DEVICE, true);
504     for (elt = list; elt; elt = elt->next) {
505         DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
506                                              TYPE_DEVICE);
507         const char *name;
508         int indent = 2;
509 
510         if (!dc->vmsd) {
511             continue;
512         }
513 
514         if (!first) {
515             fprintf(out_file, ",\n");
516         }
517         name = object_class_get_name(OBJECT_CLASS(dc));
518         fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
519         indent += 2;
520         fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
521         fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
522                 dc->vmsd->version_id);
523         fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
524                 dc->vmsd->minimum_version_id);
525 
526         dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
527 
528         fprintf(out_file, "\n%*s}", indent - 2, "");
529         first = false;
530     }
531     fprintf(out_file, "\n}\n");
532     fclose(out_file);
533 }
534 
535 static int calculate_new_instance_id(const char *idstr)
536 {
537     SaveStateEntry *se;
538     int instance_id = 0;
539 
540     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
541         if (strcmp(idstr, se->idstr) == 0
542             && instance_id <= se->instance_id) {
543             instance_id = se->instance_id + 1;
544         }
545     }
546     return instance_id;
547 }
548 
549 static int calculate_compat_instance_id(const char *idstr)
550 {
551     SaveStateEntry *se;
552     int instance_id = 0;
553 
554     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
555         if (!se->compat) {
556             continue;
557         }
558 
559         if (strcmp(idstr, se->compat->idstr) == 0
560             && instance_id <= se->compat->instance_id) {
561             instance_id = se->compat->instance_id + 1;
562         }
563     }
564     return instance_id;
565 }
566 
567 static inline MigrationPriority save_state_priority(SaveStateEntry *se)
568 {
569     if (se->vmsd) {
570         return se->vmsd->priority;
571     }
572     return MIG_PRI_DEFAULT;
573 }
574 
575 static void savevm_state_handler_insert(SaveStateEntry *nse)
576 {
577     MigrationPriority priority = save_state_priority(nse);
578     SaveStateEntry *se;
579 
580     assert(priority <= MIG_PRI_MAX);
581 
582     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
583         if (save_state_priority(se) < priority) {
584             break;
585         }
586     }
587 
588     if (se) {
589         QTAILQ_INSERT_BEFORE(se, nse, entry);
590     } else {
591         QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
592     }
593 }
594 
595 /* TODO: Individual devices generally have very little idea about the rest
596    of the system, so instance_id should be removed/replaced.
597    Meanwhile pass -1 as instance_id if you do not already have a clearly
598    distinguishing id for all instances of your device class. */
599 int register_savevm_live(DeviceState *dev,
600                          const char *idstr,
601                          int instance_id,
602                          int version_id,
603                          SaveVMHandlers *ops,
604                          void *opaque)
605 {
606     SaveStateEntry *se;
607 
608     se = g_new0(SaveStateEntry, 1);
609     se->version_id = version_id;
610     se->section_id = savevm_state.global_section_id++;
611     se->ops = ops;
612     se->opaque = opaque;
613     se->vmsd = NULL;
614     /* if this is a live_savem then set is_ram */
615     if (ops->save_live_setup != NULL) {
616         se->is_ram = 1;
617     }
618 
619     if (dev) {
620         char *id = qdev_get_dev_path(dev);
621         if (id) {
622             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
623                 sizeof(se->idstr)) {
624                 error_report("Path too long for VMState (%s)", id);
625                 g_free(id);
626                 g_free(se);
627 
628                 return -1;
629             }
630             g_free(id);
631 
632             se->compat = g_new0(CompatEntry, 1);
633             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr);
634             se->compat->instance_id = instance_id == -1 ?
635                          calculate_compat_instance_id(idstr) : instance_id;
636             instance_id = -1;
637         }
638     }
639     pstrcat(se->idstr, sizeof(se->idstr), idstr);
640 
641     if (instance_id == -1) {
642         se->instance_id = calculate_new_instance_id(se->idstr);
643     } else {
644         se->instance_id = instance_id;
645     }
646     assert(!se->compat || se->instance_id == 0);
647     savevm_state_handler_insert(se);
648     return 0;
649 }
650 
651 void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque)
652 {
653     SaveStateEntry *se, *new_se;
654     char id[256] = "";
655 
656     if (dev) {
657         char *path = qdev_get_dev_path(dev);
658         if (path) {
659             pstrcpy(id, sizeof(id), path);
660             pstrcat(id, sizeof(id), "/");
661             g_free(path);
662         }
663     }
664     pstrcat(id, sizeof(id), idstr);
665 
666     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
667         if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
668             QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
669             g_free(se->compat);
670             g_free(se);
671         }
672     }
673 }
674 
675 int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
676                                    const VMStateDescription *vmsd,
677                                    void *opaque, int alias_id,
678                                    int required_for_version,
679                                    Error **errp)
680 {
681     SaveStateEntry *se;
682 
683     /* If this triggers, alias support can be dropped for the vmsd. */
684     assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
685 
686     se = g_new0(SaveStateEntry, 1);
687     se->version_id = vmsd->version_id;
688     se->section_id = savevm_state.global_section_id++;
689     se->opaque = opaque;
690     se->vmsd = vmsd;
691     se->alias_id = alias_id;
692 
693     if (dev) {
694         char *id = qdev_get_dev_path(dev);
695         if (id) {
696             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
697                 sizeof(se->idstr)) {
698                 error_setg(errp, "Path too long for VMState (%s)", id);
699                 g_free(id);
700                 g_free(se);
701 
702                 return -1;
703             }
704             g_free(id);
705 
706             se->compat = g_new0(CompatEntry, 1);
707             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
708             se->compat->instance_id = instance_id == -1 ?
709                          calculate_compat_instance_id(vmsd->name) : instance_id;
710             instance_id = -1;
711         }
712     }
713     pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
714 
715     if (instance_id == -1) {
716         se->instance_id = calculate_new_instance_id(se->idstr);
717     } else {
718         se->instance_id = instance_id;
719     }
720     assert(!se->compat || se->instance_id == 0);
721     savevm_state_handler_insert(se);
722     return 0;
723 }
724 
725 void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd,
726                         void *opaque)
727 {
728     SaveStateEntry *se, *new_se;
729 
730     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
731         if (se->vmsd == vmsd && se->opaque == opaque) {
732             QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
733             g_free(se->compat);
734             g_free(se);
735         }
736     }
737 }
738 
739 static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
740 {
741     trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
742     if (!se->vmsd) {         /* Old style */
743         return se->ops->load_state(f, se->opaque, se->load_version_id);
744     }
745     return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
746 }
747 
748 static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
749 {
750     int64_t old_offset, size;
751 
752     old_offset = qemu_ftell_fast(f);
753     se->ops->save_state(f, se->opaque);
754     size = qemu_ftell_fast(f) - old_offset;
755 
756     if (vmdesc) {
757         json_prop_int(vmdesc, "size", size);
758         json_start_array(vmdesc, "fields");
759         json_start_object(vmdesc, NULL);
760         json_prop_str(vmdesc, "name", "data");
761         json_prop_int(vmdesc, "size", size);
762         json_prop_str(vmdesc, "type", "buffer");
763         json_end_object(vmdesc);
764         json_end_array(vmdesc);
765     }
766 }
767 
768 static void vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
769 {
770     trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
771     if (!se->vmsd) {
772         vmstate_save_old_style(f, se, vmdesc);
773         return;
774     }
775     vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
776 }
777 
/*
 * Turn off section footers: after this call save_section_footer() writes
 * nothing.  There is no way to turn them back on in this file.
 */
void savevm_skip_section_footers(void)
{
    skip_section_footers = true;
}
782 
783 /*
784  * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
785  */
786 static void save_section_header(QEMUFile *f, SaveStateEntry *se,
787                                 uint8_t section_type)
788 {
789     qemu_put_byte(f, section_type);
790     qemu_put_be32(f, se->section_id);
791 
792     if (section_type == QEMU_VM_SECTION_FULL ||
793         section_type == QEMU_VM_SECTION_START) {
794         /* ID string */
795         size_t len = strlen(se->idstr);
796         qemu_put_byte(f, len);
797         qemu_put_buffer(f, (uint8_t *)se->idstr, len);
798 
799         qemu_put_be32(f, se->instance_id);
800         qemu_put_be32(f, se->version_id);
801     }
802 }
803 
804 /*
805  * Write a footer onto device sections that catches cases misformatted device
806  * sections.
807  */
808 static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
809 {
810     if (!skip_section_footers) {
811         qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
812         qemu_put_be32(f, se->section_id);
813     }
814 }
815 
/**
 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 *                           command and associated data.
 *
 * On the wire this is: the QEMU_VM_COMMAND byte, the command as a be16,
 * the payload length as a be16, then @len bytes of payload.  The stream is
 * flushed afterwards.
 *
 * @f: File to send command on
 * @command: Command type to send
 * @len: Length of associated data
 * @data: Data associated with command; callers in this file pass NULL
 *        together with a @len of 0 for commands without payload.
 */
static void qemu_savevm_command_send(QEMUFile *f,
                                     enum qemu_vm_cmd command,
                                     uint16_t len,
                                     uint8_t *data)
{
    trace_savevm_command_send(command, len);
    qemu_put_byte(f, QEMU_VM_COMMAND);
    qemu_put_be16(f, (uint16_t)command);
    qemu_put_be16(f, len);
    qemu_put_buffer(f, data, len);
    qemu_fflush(f);
}
837 
838 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
839 {
840     uint32_t buf;
841 
842     trace_savevm_send_ping(value);
843     buf = cpu_to_be32(value);
844     qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
845 }
846 
/* Send MIG_CMD_OPEN_RETURN_PATH on @f; the command carries no payload. */
void qemu_savevm_send_open_return_path(QEMUFile *f)
{
    trace_savevm_send_open_return_path();
    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
}
852 
853 /* We have a buffer of data to send; we don't want that all to be loaded
854  * by the command itself, so the command contains just the length of the
855  * extra buffer that we then send straight after it.
856  * TODO: Must be a better way to organise that
857  *
858  * Returns:
859  *    0 on success
860  *    -ve on error
861  */
862 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
863 {
864     uint32_t tmp;
865 
866     if (len > MAX_VM_CMD_PACKAGED_SIZE) {
867         error_report("%s: Unreasonably large packaged state: %zu",
868                      __func__, len);
869         return -1;
870     }
871 
872     tmp = cpu_to_be32(len);
873 
874     trace_qemu_savevm_send_packaged();
875     qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
876 
877     qemu_put_buffer(f, buf, len);
878 
879     return 0;
880 }
881 
882 /* Send prior to any postcopy transfer */
883 void qemu_savevm_send_postcopy_advise(QEMUFile *f)
884 {
885     uint64_t tmp[2];
886     tmp[0] = cpu_to_be64(ram_pagesize_summary());
887     tmp[1] = cpu_to_be64(qemu_target_page_size());
888 
889     trace_qemu_savevm_send_postcopy_advise();
890     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp);
891 }
892 
893 /* Sent prior to starting the destination running in postcopy, discard pages
894  * that have already been sent but redirtied on the source.
895  * CMD_POSTCOPY_RAM_DISCARD consist of:
896  *      byte   version (0)
897  *      byte   Length of name field (not including 0)
898  *  n x byte   RAM block name
899  *      byte   0 terminator (just for safety)
900  *  n x        Byte ranges within the named RAMBlock
901  *      be64   Start of the range
902  *      be64   Length
903  *
904  *  name:  RAMBlock name that these entries are part of
905  *  len: Number of page entries
906  *  start_list: 'len' addresses
907  *  length_list: 'len' addresses
908  *
909  */
910 void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
911                                            uint16_t len,
912                                            uint64_t *start_list,
913                                            uint64_t *length_list)
914 {
915     uint8_t *buf;
916     uint16_t tmplen;
917     uint16_t t;
918     size_t name_len = strlen(name);
919 
920     trace_qemu_savevm_send_postcopy_ram_discard(name, len);
921     assert(name_len < 256);
922     buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
923     buf[0] = postcopy_ram_discard_version;
924     buf[1] = name_len;
925     memcpy(buf + 2, name, name_len);
926     tmplen = 2 + name_len;
927     buf[tmplen++] = '\0';
928 
929     for (t = 0; t < len; t++) {
930         stq_be_p(buf + tmplen, start_list[t]);
931         tmplen += 8;
932         stq_be_p(buf + tmplen, length_list[t]);
933         tmplen += 8;
934     }
935     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
936     g_free(buf);
937 }
938 
/*
 * Get the destination into a state where it can receive postcopy data:
 * sends MIG_CMD_POSTCOPY_LISTEN on @f, with no payload.
 */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}
945 
946 /* Kick the destination into running */
947 void qemu_savevm_send_postcopy_run(QEMUFile *f)
948 {
949     trace_savevm_send_postcopy_run();
950     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
951 }
952 
953 bool qemu_savevm_state_blocked(Error **errp)
954 {
955     SaveStateEntry *se;
956 
957     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
958         if (se->vmsd && se->vmsd->unmigratable) {
959             error_setg(errp, "State blocked by non-migratable device '%s'",
960                        se->idstr);
961             return true;
962         }
963     }
964     return false;
965 }
966 
967 static bool enforce_config_section(void)
968 {
969     MachineState *machine = MACHINE(qdev_get_machine());
970     return machine->enforce_config_section;
971 }
972 
973 void qemu_savevm_state_header(QEMUFile *f)
974 {
975     trace_savevm_state_header();
976     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
977     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
978 
979     if (!savevm_state.skip_configuration || enforce_config_section()) {
980         qemu_put_byte(f, QEMU_VM_CONFIGURATION);
981         vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
982     }
983 
984 }
985 
986 void qemu_savevm_state_begin(QEMUFile *f)
987 {
988     SaveStateEntry *se;
989     int ret;
990 
991     trace_savevm_state_begin();
992     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
993         if (!se->ops || !se->ops->save_live_setup) {
994             continue;
995         }
996         if (se->ops && se->ops->is_active) {
997             if (!se->ops->is_active(se->opaque)) {
998                 continue;
999             }
1000         }
1001         save_section_header(f, se, QEMU_VM_SECTION_START);
1002 
1003         ret = se->ops->save_live_setup(f, se->opaque);
1004         save_section_footer(f, se);
1005         if (ret < 0) {
1006             qemu_file_set_error(f, ret);
1007             break;
1008         }
1009     }
1010 }
1011 
1012 /*
1013  * this function has three return values:
1014  *   negative: there was one error, and we have -errno.
1015  *   0 : We haven't finished, caller have to go again
1016  *   1 : We have finished, we can go to complete phase
1017  */
1018 int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1019 {
1020     SaveStateEntry *se;
1021     int ret = 1;
1022 
1023     trace_savevm_state_iterate();
1024     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1025         if (!se->ops || !se->ops->save_live_iterate) {
1026             continue;
1027         }
1028         if (se->ops && se->ops->is_active) {
1029             if (!se->ops->is_active(se->opaque)) {
1030                 continue;
1031             }
1032         }
1033         /*
1034          * In the postcopy phase, any device that doesn't know how to
1035          * do postcopy should have saved it's state in the _complete
1036          * call that's already run, it might get confused if we call
1037          * iterate afterwards.
1038          */
1039         if (postcopy && !se->ops->save_live_complete_postcopy) {
1040             continue;
1041         }
1042         if (qemu_file_rate_limit(f)) {
1043             return 0;
1044         }
1045         trace_savevm_section_start(se->idstr, se->section_id);
1046 
1047         save_section_header(f, se, QEMU_VM_SECTION_PART);
1048 
1049         ret = se->ops->save_live_iterate(f, se->opaque);
1050         trace_savevm_section_end(se->idstr, se->section_id, ret);
1051         save_section_footer(f, se);
1052 
1053         if (ret < 0) {
1054             qemu_file_set_error(f, ret);
1055         }
1056         if (ret <= 0) {
1057             /* Do not proceed to the next vmstate before this one reported
1058                completion of the current stage. This serializes the migration
1059                and reduces the probability that a faster changing state is
1060                synchronized over and over again. */
1061             break;
1062         }
1063     }
1064     return ret;
1065 }
1066 
1067 static bool should_send_vmdesc(void)
1068 {
1069     MachineState *machine = MACHINE(qdev_get_machine());
1070     bool in_postcopy = migration_in_postcopy();
1071     return !machine->suppress_vmdesc && !in_postcopy;
1072 }
1073 
1074 /*
1075  * Calls the save_live_complete_postcopy methods
1076  * causing the last few pages to be sent immediately and doing any associated
1077  * cleanup.
1078  * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1079  * all the other devices, but that happens at the point we switch to postcopy.
1080  */
1081 void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1082 {
1083     SaveStateEntry *se;
1084     int ret;
1085 
1086     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1087         if (!se->ops || !se->ops->save_live_complete_postcopy) {
1088             continue;
1089         }
1090         if (se->ops && se->ops->is_active) {
1091             if (!se->ops->is_active(se->opaque)) {
1092                 continue;
1093             }
1094         }
1095         trace_savevm_section_start(se->idstr, se->section_id);
1096         /* Section type */
1097         qemu_put_byte(f, QEMU_VM_SECTION_END);
1098         qemu_put_be32(f, se->section_id);
1099 
1100         ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1101         trace_savevm_section_end(se->idstr, se->section_id, ret);
1102         save_section_footer(f, se);
1103         if (ret < 0) {
1104             qemu_file_set_error(f, ret);
1105             return;
1106         }
1107     }
1108 
1109     qemu_put_byte(f, QEMU_VM_EOF);
1110     qemu_fflush(f);
1111 }
1112 
1113 void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only)
1114 {
1115     QJSON *vmdesc;
1116     int vmdesc_len;
1117     SaveStateEntry *se;
1118     int ret;
1119     bool in_postcopy = migration_in_postcopy();
1120 
1121     trace_savevm_state_complete_precopy();
1122 
1123     cpu_synchronize_all_states();
1124 
1125     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1126         if (!se->ops ||
1127             (in_postcopy && se->ops->save_live_complete_postcopy) ||
1128             (in_postcopy && !iterable_only) ||
1129             !se->ops->save_live_complete_precopy) {
1130             continue;
1131         }
1132 
1133         if (se->ops && se->ops->is_active) {
1134             if (!se->ops->is_active(se->opaque)) {
1135                 continue;
1136             }
1137         }
1138         trace_savevm_section_start(se->idstr, se->section_id);
1139 
1140         save_section_header(f, se, QEMU_VM_SECTION_END);
1141 
1142         ret = se->ops->save_live_complete_precopy(f, se->opaque);
1143         trace_savevm_section_end(se->idstr, se->section_id, ret);
1144         save_section_footer(f, se);
1145         if (ret < 0) {
1146             qemu_file_set_error(f, ret);
1147             return;
1148         }
1149     }
1150 
1151     if (iterable_only) {
1152         return;
1153     }
1154 
1155     vmdesc = qjson_new();
1156     json_prop_int(vmdesc, "page_size", qemu_target_page_size());
1157     json_start_array(vmdesc, "devices");
1158     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1159 
1160         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1161             continue;
1162         }
1163         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1164             trace_savevm_section_skip(se->idstr, se->section_id);
1165             continue;
1166         }
1167 
1168         trace_savevm_section_start(se->idstr, se->section_id);
1169 
1170         json_start_object(vmdesc, NULL);
1171         json_prop_str(vmdesc, "name", se->idstr);
1172         json_prop_int(vmdesc, "instance_id", se->instance_id);
1173 
1174         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1175         vmstate_save(f, se, vmdesc);
1176         trace_savevm_section_end(se->idstr, se->section_id, 0);
1177         save_section_footer(f, se);
1178 
1179         json_end_object(vmdesc);
1180     }
1181 
1182     if (!in_postcopy) {
1183         /* Postcopy stream will still be going */
1184         qemu_put_byte(f, QEMU_VM_EOF);
1185     }
1186 
1187     json_end_array(vmdesc);
1188     qjson_finish(vmdesc);
1189     vmdesc_len = strlen(qjson_get_str(vmdesc));
1190 
1191     if (should_send_vmdesc()) {
1192         qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1193         qemu_put_be32(f, vmdesc_len);
1194         qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len);
1195     }
1196     qjson_destroy(vmdesc);
1197 
1198     qemu_fflush(f);
1199 }
1200 
1201 /* Give an estimate of the amount left to be transferred,
1202  * the result is split into the amount for units that can and
1203  * for units that can't do postcopy.
1204  */
1205 void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1206                                uint64_t *res_non_postcopiable,
1207                                uint64_t *res_postcopiable)
1208 {
1209     SaveStateEntry *se;
1210 
1211     *res_non_postcopiable = 0;
1212     *res_postcopiable = 0;
1213 
1214 
1215     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1216         if (!se->ops || !se->ops->save_live_pending) {
1217             continue;
1218         }
1219         if (se->ops && se->ops->is_active) {
1220             if (!se->ops->is_active(se->opaque)) {
1221                 continue;
1222             }
1223         }
1224         se->ops->save_live_pending(f, se->opaque, threshold_size,
1225                                    res_non_postcopiable, res_postcopiable);
1226     }
1227 }
1228 
1229 void qemu_savevm_state_cleanup(void)
1230 {
1231     SaveStateEntry *se;
1232 
1233     trace_savevm_state_cleanup();
1234     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1235         if (se->ops && se->ops->cleanup) {
1236             se->ops->cleanup(se->opaque);
1237         }
1238     }
1239 }
1240 
1241 static int qemu_savevm_state(QEMUFile *f, Error **errp)
1242 {
1243     int ret;
1244     MigrationState *ms = migrate_init();
1245     MigrationStatus status;
1246     ms->to_dst_file = f;
1247 
1248     if (migration_is_blocked(errp)) {
1249         ret = -EINVAL;
1250         goto done;
1251     }
1252 
1253     if (migrate_use_block()) {
1254         error_setg(errp, "Block migration and snapshots are incompatible");
1255         ret = -EINVAL;
1256         goto done;
1257     }
1258 
1259     qemu_mutex_unlock_iothread();
1260     qemu_savevm_state_header(f);
1261     qemu_savevm_state_begin(f);
1262     qemu_mutex_lock_iothread();
1263 
1264     while (qemu_file_get_error(f) == 0) {
1265         if (qemu_savevm_state_iterate(f, false) > 0) {
1266             break;
1267         }
1268     }
1269 
1270     ret = qemu_file_get_error(f);
1271     if (ret == 0) {
1272         qemu_savevm_state_complete_precopy(f, false);
1273         ret = qemu_file_get_error(f);
1274     }
1275     qemu_savevm_state_cleanup();
1276     if (ret != 0) {
1277         error_setg_errno(errp, -ret, "Error while writing VM state");
1278     }
1279 
1280 done:
1281     if (ret != 0) {
1282         status = MIGRATION_STATUS_FAILED;
1283     } else {
1284         status = MIGRATION_STATUS_COMPLETED;
1285     }
1286     migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1287 
1288     /* f is outer parameter, it should not stay in global migration state after
1289      * this function finished */
1290     ms->to_dst_file = NULL;
1291 
1292     return ret;
1293 }
1294 
1295 static int qemu_save_device_state(QEMUFile *f)
1296 {
1297     SaveStateEntry *se;
1298 
1299     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1300     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1301 
1302     cpu_synchronize_all_states();
1303 
1304     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1305         if (se->is_ram) {
1306             continue;
1307         }
1308         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1309             continue;
1310         }
1311         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1312             continue;
1313         }
1314 
1315         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1316 
1317         vmstate_save(f, se, NULL);
1318 
1319         save_section_footer(f, se);
1320     }
1321 
1322     qemu_put_byte(f, QEMU_VM_EOF);
1323 
1324     return qemu_file_get_error(f);
1325 }
1326 
1327 static SaveStateEntry *find_se(const char *idstr, int instance_id)
1328 {
1329     SaveStateEntry *se;
1330 
1331     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1332         if (!strcmp(se->idstr, idstr) &&
1333             (instance_id == se->instance_id ||
1334              instance_id == se->alias_id))
1335             return se;
1336         /* Migrating from an older version? */
1337         if (strstr(se->idstr, idstr) && se->compat) {
1338             if (!strcmp(se->compat->idstr, idstr) &&
1339                 (instance_id == se->compat->instance_id ||
1340                  instance_id == se->alias_id))
1341                 return se;
1342         }
1343     }
1344     return NULL;
1345 }
1346 
/* Positive return codes used by the loadvm command handlers. */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT     =  1,
};
1351 
1352 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
1353 
1354 /* ------ incoming postcopy messages ------ */
1355 /* 'advise' arrives before any transfers just to tell us that a postcopy
1356  * *might* happen - it might be skipped if precopy transferred everything
1357  * quickly.
1358  */
1359 static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
1360 {
1361     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1362     uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1363 
1364     trace_loadvm_postcopy_handle_advise();
1365     if (ps != POSTCOPY_INCOMING_NONE) {
1366         error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1367         return -1;
1368     }
1369 
1370     if (!postcopy_ram_supported_by_host()) {
1371         postcopy_state_set(POSTCOPY_INCOMING_NONE);
1372         return -1;
1373     }
1374 
1375     remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1376     local_pagesize_summary = ram_pagesize_summary();
1377 
1378     if (remote_pagesize_summary != local_pagesize_summary)  {
1379         /*
1380          * This detects two potential causes of mismatch:
1381          *   a) A mismatch in host page sizes
1382          *      Some combinations of mismatch are probably possible but it gets
1383          *      a bit more complicated.  In particular we need to place whole
1384          *      host pages on the dest at once, and we need to ensure that we
1385          *      handle dirtying to make sure we never end up sending part of
1386          *      a hostpage on it's own.
1387          *   b) The use of different huge page sizes on source/destination
1388          *      a more fine grain test is performed during RAM block migration
1389          *      but this test here causes a nice early clear failure, and
1390          *      also fails when passed to an older qemu that doesn't
1391          *      do huge pages.
1392          */
1393         error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1394                                                              " d=%" PRIx64 ")",
1395                      remote_pagesize_summary, local_pagesize_summary);
1396         return -1;
1397     }
1398 
1399     remote_tps = qemu_get_be64(mis->from_src_file);
1400     if (remote_tps != qemu_target_page_size()) {
1401         /*
1402          * Again, some differences could be dealt with, but for now keep it
1403          * simple.
1404          */
1405         error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1406                      (int)remote_tps, qemu_target_page_size());
1407         return -1;
1408     }
1409 
1410     if (ram_postcopy_incoming_init(mis)) {
1411         return -1;
1412     }
1413 
1414     postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1415 
1416     return 0;
1417 }
1418 
1419 /* After postcopy we will be told to throw some pages away since they're
1420  * dirty and will have to be demand fetched.  Must happen before CPU is
1421  * started.
1422  * There can be 0..many of these messages, each encoding multiple pages.
1423  */
1424 static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1425                                               uint16_t len)
1426 {
1427     int tmp;
1428     char ramid[256];
1429     PostcopyState ps = postcopy_state_get();
1430 
1431     trace_loadvm_postcopy_ram_handle_discard();
1432 
1433     switch (ps) {
1434     case POSTCOPY_INCOMING_ADVISE:
1435         /* 1st discard */
1436         tmp = postcopy_ram_prepare_discard(mis);
1437         if (tmp) {
1438             return tmp;
1439         }
1440         break;
1441 
1442     case POSTCOPY_INCOMING_DISCARD:
1443         /* Expected state */
1444         break;
1445 
1446     default:
1447         error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1448                      ps);
1449         return -1;
1450     }
1451     /* We're expecting a
1452      *    Version (0)
1453      *    a RAM ID string (length byte, name, 0 term)
1454      *    then at least 1 16 byte chunk
1455     */
1456     if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1457         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1458         return -1;
1459     }
1460 
1461     tmp = qemu_get_byte(mis->from_src_file);
1462     if (tmp != postcopy_ram_discard_version) {
1463         error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1464         return -1;
1465     }
1466 
1467     if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1468         error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1469         return -1;
1470     }
1471     tmp = qemu_get_byte(mis->from_src_file);
1472     if (tmp != 0) {
1473         error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1474         return -1;
1475     }
1476 
1477     len -= 3 + strlen(ramid);
1478     if (len % 16) {
1479         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1480         return -1;
1481     }
1482     trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1483     while (len) {
1484         uint64_t start_addr, block_length;
1485         start_addr = qemu_get_be64(mis->from_src_file);
1486         block_length = qemu_get_be64(mis->from_src_file);
1487 
1488         len -= 16;
1489         int ret = ram_discard_range(ramid, start_addr, block_length);
1490         if (ret) {
1491             return ret;
1492         }
1493     }
1494     trace_loadvm_postcopy_ram_handle_discard_end();
1495 
1496     return 0;
1497 }
1498 
1499 /*
1500  * Triggered by a postcopy_listen command; this thread takes over reading
1501  * the input stream, leaving the main thread free to carry on loading the rest
1502  * of the device state (from RAM).
1503  * (TODO:This could do with being in a postcopy file - but there again it's
1504  * just another input loop, not that postcopy specific)
1505  */
1506 static void *postcopy_ram_listen_thread(void *opaque)
1507 {
1508     QEMUFile *f = opaque;
1509     MigrationIncomingState *mis = migration_incoming_get_current();
1510     int load_res;
1511 
1512     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1513                                    MIGRATION_STATUS_POSTCOPY_ACTIVE);
1514     qemu_sem_post(&mis->listen_thread_sem);
1515     trace_postcopy_ram_listen_thread_start();
1516 
1517     /*
1518      * Because we're a thread and not a coroutine we can't yield
1519      * in qemu_file, and thus we must be blocking now.
1520      */
1521     qemu_file_set_blocking(f, true);
1522     load_res = qemu_loadvm_state_main(f, mis);
1523     /* And non-blocking again so we don't block in any cleanup */
1524     qemu_file_set_blocking(f, false);
1525 
1526     trace_postcopy_ram_listen_thread_exit();
1527     if (load_res < 0) {
1528         error_report("%s: loadvm failed: %d", __func__, load_res);
1529         qemu_file_set_error(f, load_res);
1530         migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1531                                        MIGRATION_STATUS_FAILED);
1532     } else {
1533         /*
1534          * This looks good, but it's possible that the device loading in the
1535          * main thread hasn't finished yet, and so we might not be in 'RUN'
1536          * state yet; wait for the end of the main thread.
1537          */
1538         qemu_event_wait(&mis->main_thread_load_event);
1539     }
1540     postcopy_ram_incoming_cleanup(mis);
1541 
1542     if (load_res < 0) {
1543         /*
1544          * If something went wrong then we have a bad state so exit;
1545          * depending how far we got it might be possible at this point
1546          * to leave the guest running and fire MCEs for pages that never
1547          * arrived as a desperate recovery step.
1548          */
1549         exit(EXIT_FAILURE);
1550     }
1551 
1552     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1553                                    MIGRATION_STATUS_COMPLETED);
1554     /*
1555      * If everything has worked fine, then the main thread has waited
1556      * for us to start, and we're the last use of the mis.
1557      * (If something broke then qemu will have to exit anyway since it's
1558      * got a bad migration state).
1559      */
1560     migration_incoming_state_destroy();
1561 
1562 
1563     return NULL;
1564 }
1565 
1566 /* After this message we must be able to immediately receive postcopy data */
1567 static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1568 {
1569     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1570     trace_loadvm_postcopy_handle_listen();
1571     if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1572         error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1573         return -1;
1574     }
1575     if (ps == POSTCOPY_INCOMING_ADVISE) {
1576         /*
1577          * A rare case, we entered listen without having to do any discards,
1578          * so do the setup that's normally done at the time of the 1st discard.
1579          */
1580         postcopy_ram_prepare_discard(mis);
1581     }
1582 
1583     /*
1584      * Sensitise RAM - can now generate requests for blocks that don't exist
1585      * However, at this point the CPU shouldn't be running, and the IO
1586      * shouldn't be doing anything yet so don't actually expect requests
1587      */
1588     if (postcopy_ram_enable_notify(mis)) {
1589         return -1;
1590     }
1591 
1592     if (mis->have_listen_thread) {
1593         error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
1594         return -1;
1595     }
1596 
1597     mis->have_listen_thread = true;
1598     /* Start up the listening thread and wait for it to signal ready */
1599     qemu_sem_init(&mis->listen_thread_sem, 0);
1600     qemu_thread_create(&mis->listen_thread, "postcopy/listen",
1601                        postcopy_ram_listen_thread, mis->from_src_file,
1602                        QEMU_THREAD_DETACHED);
1603     qemu_sem_wait(&mis->listen_thread_sem);
1604     qemu_sem_destroy(&mis->listen_thread_sem);
1605 
1606     return 0;
1607 }
1608 
1609 
1610 typedef struct {
1611     QEMUBH *bh;
1612 } HandleRunBhData;
1613 
1614 static void loadvm_postcopy_handle_run_bh(void *opaque)
1615 {
1616     Error *local_err = NULL;
1617     HandleRunBhData *data = opaque;
1618 
1619     /* TODO we should move all of this lot into postcopy_ram.c or a shared code
1620      * in migration.c
1621      */
1622     cpu_synchronize_all_post_init();
1623 
1624     qemu_announce_self();
1625 
1626     /* Make sure all file formats flush their mutable metadata.
1627      * If we get an error here, just don't restart the VM yet. */
1628     bdrv_invalidate_cache_all(&local_err);
1629     if (local_err) {
1630         error_report_err(local_err);
1631         local_err = NULL;
1632         autostart = false;
1633     }
1634 
1635     trace_loadvm_postcopy_handle_run_cpu_sync();
1636     cpu_synchronize_all_post_init();
1637 
1638     trace_loadvm_postcopy_handle_run_vmstart();
1639 
1640     if (autostart) {
1641         /* Hold onto your hats, starting the CPU */
1642         vm_start();
1643     } else {
1644         /* leave it paused and let management decide when to start the CPU */
1645         runstate_set(RUN_STATE_PAUSED);
1646     }
1647 
1648     qemu_bh_delete(data->bh);
1649     g_free(data);
1650 }
1651 
1652 /* After all discards we can start running and asking for pages */
1653 static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
1654 {
1655     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
1656     HandleRunBhData *data;
1657 
1658     trace_loadvm_postcopy_handle_run();
1659     if (ps != POSTCOPY_INCOMING_LISTENING) {
1660         error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
1661         return -1;
1662     }
1663 
1664     data = g_new(HandleRunBhData, 1);
1665     data->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, data);
1666     qemu_bh_schedule(data->bh);
1667 
1668     /* We need to finish reading the stream from the package
1669      * and also stop reading anything more from the stream that loaded the
1670      * package (since it's now being read by the listener thread).
1671      * LOADVM_QUIT will quit all the layers of nested loadvm loops.
1672      */
1673     return LOADVM_QUIT;
1674 }
1675 
1676 /**
1677  * Immediately following this command is a blob of data containing an embedded
1678  * chunk of migration stream; read it and load it.
1679  *
1680  * @mis: Incoming state
1681  * @length: Length of packaged data to read
1682  *
1683  * Returns: Negative values on error
1684  *
1685  */
1686 static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
1687 {
1688     int ret;
1689     size_t length;
1690     QIOChannelBuffer *bioc;
1691 
1692     length = qemu_get_be32(mis->from_src_file);
1693     trace_loadvm_handle_cmd_packaged(length);
1694 
1695     if (length > MAX_VM_CMD_PACKAGED_SIZE) {
1696         error_report("Unreasonably large packaged state: %zu", length);
1697         return -1;
1698     }
1699 
1700     bioc = qio_channel_buffer_new(length);
1701     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
1702     ret = qemu_get_buffer(mis->from_src_file,
1703                           bioc->data,
1704                           length);
1705     if (ret != length) {
1706         object_unref(OBJECT(bioc));
1707         error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
1708                      ret, length);
1709         return (ret < 0) ? ret : -EAGAIN;
1710     }
1711     bioc->usage += length;
1712     trace_loadvm_handle_cmd_packaged_received(ret);
1713 
1714     QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
1715 
1716     ret = qemu_loadvm_state_main(packf, mis);
1717     trace_loadvm_handle_cmd_packaged_main(ret);
1718     qemu_fclose(packf);
1719     object_unref(OBJECT(bioc));
1720 
1721     return ret;
1722 }
1723 
1724 /*
1725  * Process an incoming 'QEMU_VM_COMMAND'
1726  * 0           just a normal return
1727  * LOADVM_QUIT All good, but exit the loop
1728  * <0          Error
1729  */
1730 static int loadvm_process_command(QEMUFile *f)
1731 {
1732     MigrationIncomingState *mis = migration_incoming_get_current();
1733     uint16_t cmd;
1734     uint16_t len;
1735     uint32_t tmp32;
1736 
1737     cmd = qemu_get_be16(f);
1738     len = qemu_get_be16(f);
1739 
1740     trace_loadvm_process_command(cmd, len);
1741     if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
1742         error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
1743         return -EINVAL;
1744     }
1745 
1746     if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
1747         error_report("%s received with bad length - expecting %zu, got %d",
1748                      mig_cmd_args[cmd].name,
1749                      (size_t)mig_cmd_args[cmd].len, len);
1750         return -ERANGE;
1751     }
1752 
1753     switch (cmd) {
1754     case MIG_CMD_OPEN_RETURN_PATH:
1755         if (mis->to_src_file) {
1756             error_report("CMD_OPEN_RETURN_PATH called when RP already open");
1757             /* Not really a problem, so don't give up */
1758             return 0;
1759         }
1760         mis->to_src_file = qemu_file_get_return_path(f);
1761         if (!mis->to_src_file) {
1762             error_report("CMD_OPEN_RETURN_PATH failed");
1763             return -1;
1764         }
1765         break;
1766 
1767     case MIG_CMD_PING:
1768         tmp32 = qemu_get_be32(f);
1769         trace_loadvm_process_command_ping(tmp32);
1770         if (!mis->to_src_file) {
1771             error_report("CMD_PING (0x%x) received with no return path",
1772                          tmp32);
1773             return -1;
1774         }
1775         migrate_send_rp_pong(mis, tmp32);
1776         break;
1777 
1778     case MIG_CMD_PACKAGED:
1779         return loadvm_handle_cmd_packaged(mis);
1780 
1781     case MIG_CMD_POSTCOPY_ADVISE:
1782         return loadvm_postcopy_handle_advise(mis);
1783 
1784     case MIG_CMD_POSTCOPY_LISTEN:
1785         return loadvm_postcopy_handle_listen(mis);
1786 
1787     case MIG_CMD_POSTCOPY_RUN:
1788         return loadvm_postcopy_handle_run(mis);
1789 
1790     case MIG_CMD_POSTCOPY_RAM_DISCARD:
1791         return loadvm_postcopy_ram_handle_discard(mis, len);
1792     }
1793 
1794     return 0;
1795 }
1796 
1797 /*
1798  * Read a footer off the wire and check that it matches the expected section
1799  *
1800  * Returns: true if the footer was good
1801  *          false if there is a problem (and calls error_report to say why)
1802  */
1803 static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
1804 {
1805     uint8_t read_mark;
1806     uint32_t read_section_id;
1807 
1808     if (skip_section_footers) {
1809         /* No footer to check */
1810         return true;
1811     }
1812 
1813     read_mark = qemu_get_byte(f);
1814 
1815     if (read_mark != QEMU_VM_SECTION_FOOTER) {
1816         error_report("Missing section footer for %s", se->idstr);
1817         return false;
1818     }
1819 
1820     read_section_id = qemu_get_be32(f);
1821     if (read_section_id != se->load_section_id) {
1822         error_report("Mismatched section id in footer for %s -"
1823                      " read 0x%x expected 0x%x",
1824                      se->idstr, read_section_id, se->load_section_id);
1825         return false;
1826     }
1827 
1828     /* All good */
1829     return true;
1830 }
1831 
1832 static int
1833 qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
1834 {
1835     uint32_t instance_id, version_id, section_id;
1836     SaveStateEntry *se;
1837     char idstr[256];
1838     int ret;
1839 
1840     /* Read section start */
1841     section_id = qemu_get_be32(f);
1842     if (!qemu_get_counted_string(f, idstr)) {
1843         error_report("Unable to read ID string for section %u",
1844                      section_id);
1845         return -EINVAL;
1846     }
1847     instance_id = qemu_get_be32(f);
1848     version_id = qemu_get_be32(f);
1849 
1850     trace_qemu_loadvm_state_section_startfull(section_id, idstr,
1851             instance_id, version_id);
1852     /* Find savevm section */
1853     se = find_se(idstr, instance_id);
1854     if (se == NULL) {
1855         error_report("Unknown savevm section or instance '%s' %d",
1856                      idstr, instance_id);
1857         return -EINVAL;
1858     }
1859 
1860     /* Validate version */
1861     if (version_id > se->version_id) {
1862         error_report("savevm: unsupported version %d for '%s' v%d",
1863                      version_id, idstr, se->version_id);
1864         return -EINVAL;
1865     }
1866     se->load_version_id = version_id;
1867     se->load_section_id = section_id;
1868 
1869     /* Validate if it is a device's state */
1870     if (xen_enabled() && se->is_ram) {
1871         error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
1872         return -EINVAL;
1873     }
1874 
1875     ret = vmstate_load(f, se);
1876     if (ret < 0) {
1877         error_report("error while loading state for instance 0x%x of"
1878                      " device '%s'", instance_id, idstr);
1879         return ret;
1880     }
1881     if (!check_section_footer(f, se)) {
1882         return -EINVAL;
1883     }
1884 
1885     return 0;
1886 }
1887 
1888 static int
1889 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
1890 {
1891     uint32_t section_id;
1892     SaveStateEntry *se;
1893     int ret;
1894 
1895     section_id = qemu_get_be32(f);
1896 
1897     trace_qemu_loadvm_state_section_partend(section_id);
1898     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1899         if (se->load_section_id == section_id) {
1900             break;
1901         }
1902     }
1903     if (se == NULL) {
1904         error_report("Unknown savevm section %d", section_id);
1905         return -EINVAL;
1906     }
1907 
1908     ret = vmstate_load(f, se);
1909     if (ret < 0) {
1910         error_report("error while loading state section id %d(%s)",
1911                      section_id, se->idstr);
1912         return ret;
1913     }
1914     if (!check_section_footer(f, se)) {
1915         return -EINVAL;
1916     }
1917 
1918     return 0;
1919 }
1920 
1921 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
1922 {
1923     uint8_t section_type;
1924     int ret = 0;
1925 
1926     while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
1927         ret = 0;
1928         trace_qemu_loadvm_state_section(section_type);
1929         switch (section_type) {
1930         case QEMU_VM_SECTION_START:
1931         case QEMU_VM_SECTION_FULL:
1932             ret = qemu_loadvm_section_start_full(f, mis);
1933             if (ret < 0) {
1934                 goto out;
1935             }
1936             break;
1937         case QEMU_VM_SECTION_PART:
1938         case QEMU_VM_SECTION_END:
1939             ret = qemu_loadvm_section_part_end(f, mis);
1940             if (ret < 0) {
1941                 goto out;
1942             }
1943             break;
1944         case QEMU_VM_COMMAND:
1945             ret = loadvm_process_command(f);
1946             trace_qemu_loadvm_state_section_command(ret);
1947             if ((ret < 0) || (ret & LOADVM_QUIT)) {
1948                 goto out;
1949             }
1950             break;
1951         default:
1952             error_report("Unknown savevm section type %d", section_type);
1953             ret = -EINVAL;
1954             goto out;
1955         }
1956     }
1957 
1958 out:
1959     if (ret < 0) {
1960         qemu_file_set_error(f, ret);
1961     }
1962     return ret;
1963 }
1964 
1965 int qemu_loadvm_state(QEMUFile *f)
1966 {
1967     MigrationIncomingState *mis = migration_incoming_get_current();
1968     Error *local_err = NULL;
1969     unsigned int v;
1970     int ret;
1971 
1972     if (qemu_savevm_state_blocked(&local_err)) {
1973         error_report_err(local_err);
1974         return -EINVAL;
1975     }
1976 
1977     v = qemu_get_be32(f);
1978     if (v != QEMU_VM_FILE_MAGIC) {
1979         error_report("Not a migration stream");
1980         return -EINVAL;
1981     }
1982 
1983     v = qemu_get_be32(f);
1984     if (v == QEMU_VM_FILE_VERSION_COMPAT) {
1985         error_report("SaveVM v2 format is obsolete and don't work anymore");
1986         return -ENOTSUP;
1987     }
1988     if (v != QEMU_VM_FILE_VERSION) {
1989         error_report("Unsupported migration stream version");
1990         return -ENOTSUP;
1991     }
1992 
1993     if (!savevm_state.skip_configuration || enforce_config_section()) {
1994         if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
1995             error_report("Configuration section missing");
1996             return -EINVAL;
1997         }
1998         ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
1999 
2000         if (ret) {
2001             return ret;
2002         }
2003     }
2004 
2005     cpu_synchronize_all_pre_loadvm();
2006 
2007     ret = qemu_loadvm_state_main(f, mis);
2008     qemu_event_set(&mis->main_thread_load_event);
2009 
2010     trace_qemu_loadvm_state_post_main(ret);
2011 
2012     if (mis->have_listen_thread) {
2013         /* Listen thread still going, can't clean up yet */
2014         return ret;
2015     }
2016 
2017     if (ret == 0) {
2018         ret = qemu_file_get_error(f);
2019     }
2020 
2021     /*
2022      * Try to read in the VMDESC section as well, so that dumping tools that
2023      * intercept our migration stream have the chance to see it.
2024      */
2025 
2026     /* We've got to be careful; if we don't read the data and just shut the fd
2027      * then the sender can error if we close while it's still sending.
2028      * We also mustn't read data that isn't there; some transports (RDMA)
2029      * will stall waiting for that data when the source has already closed.
2030      */
2031     if (ret == 0 && should_send_vmdesc()) {
2032         uint8_t *buf;
2033         uint32_t size;
2034         uint8_t  section_type = qemu_get_byte(f);
2035 
2036         if (section_type != QEMU_VM_VMDESCRIPTION) {
2037             error_report("Expected vmdescription section, but got %d",
2038                          section_type);
2039             /*
2040              * It doesn't seem worth failing at this point since
2041              * we apparently have an otherwise valid VM state
2042              */
2043         } else {
2044             buf = g_malloc(0x1000);
2045             size = qemu_get_be32(f);
2046 
2047             while (size > 0) {
2048                 uint32_t read_chunk = MIN(size, 0x1000);
2049                 qemu_get_buffer(f, buf, read_chunk);
2050                 size -= read_chunk;
2051             }
2052             g_free(buf);
2053         }
2054     }
2055 
2056     cpu_synchronize_all_post_init();
2057 
2058     return ret;
2059 }
2060 
2061 int save_snapshot(const char *name, Error **errp)
2062 {
2063     BlockDriverState *bs, *bs1;
2064     QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1;
2065     int ret = -1;
2066     QEMUFile *f;
2067     int saved_vm_running;
2068     uint64_t vm_state_size;
2069     qemu_timeval tv;
2070     struct tm tm;
2071     AioContext *aio_context;
2072 
2073     if (!bdrv_all_can_snapshot(&bs)) {
2074         error_setg(errp, "Device '%s' is writable but does not support "
2075                    "snapshots", bdrv_get_device_name(bs));
2076         return ret;
2077     }
2078 
2079     /* Delete old snapshots of the same name */
2080     if (name) {
2081         ret = bdrv_all_delete_snapshot(name, &bs1, errp);
2082         if (ret < 0) {
2083             error_prepend(errp, "Error while deleting snapshot on device "
2084                           "'%s': ", bdrv_get_device_name(bs1));
2085             return ret;
2086         }
2087     }
2088 
2089     bs = bdrv_all_find_vmstate_bs();
2090     if (bs == NULL) {
2091         error_setg(errp, "No block device can accept snapshots");
2092         return ret;
2093     }
2094     aio_context = bdrv_get_aio_context(bs);
2095 
2096     saved_vm_running = runstate_is_running();
2097 
2098     ret = global_state_store();
2099     if (ret) {
2100         error_setg(errp, "Error saving global state");
2101         return ret;
2102     }
2103     vm_stop(RUN_STATE_SAVE_VM);
2104 
2105     aio_context_acquire(aio_context);
2106 
2107     memset(sn, 0, sizeof(*sn));
2108 
2109     /* fill auxiliary fields */
2110     qemu_gettimeofday(&tv);
2111     sn->date_sec = tv.tv_sec;
2112     sn->date_nsec = tv.tv_usec * 1000;
2113     sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2114 
2115     if (name) {
2116         ret = bdrv_snapshot_find(bs, old_sn, name);
2117         if (ret >= 0) {
2118             pstrcpy(sn->name, sizeof(sn->name), old_sn->name);
2119             pstrcpy(sn->id_str, sizeof(sn->id_str), old_sn->id_str);
2120         } else {
2121             pstrcpy(sn->name, sizeof(sn->name), name);
2122         }
2123     } else {
2124         /* cast below needed for OpenBSD where tv_sec is still 'long' */
2125         localtime_r((const time_t *)&tv.tv_sec, &tm);
2126         strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm);
2127     }
2128 
2129     /* save the VM state */
2130     f = qemu_fopen_bdrv(bs, 1);
2131     if (!f) {
2132         error_setg(errp, "Could not open VM state file");
2133         goto the_end;
2134     }
2135     ret = qemu_savevm_state(f, errp);
2136     vm_state_size = qemu_ftell(f);
2137     qemu_fclose(f);
2138     if (ret < 0) {
2139         goto the_end;
2140     }
2141 
2142     ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
2143     if (ret < 0) {
2144         error_setg(errp, "Error while creating snapshot on '%s'",
2145                    bdrv_get_device_name(bs));
2146         goto the_end;
2147     }
2148 
2149     ret = 0;
2150 
2151  the_end:
2152     aio_context_release(aio_context);
2153     if (saved_vm_running) {
2154         vm_start();
2155     }
2156     return ret;
2157 }
2158 
2159 void qmp_xen_save_devices_state(const char *filename, Error **errp)
2160 {
2161     QEMUFile *f;
2162     QIOChannelFile *ioc;
2163     int saved_vm_running;
2164     int ret;
2165 
2166     saved_vm_running = runstate_is_running();
2167     vm_stop(RUN_STATE_SAVE_VM);
2168     global_state_store_running();
2169 
2170     ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp);
2171     if (!ioc) {
2172         goto the_end;
2173     }
2174     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2175     f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2176     ret = qemu_save_device_state(f);
2177     qemu_fclose(f);
2178     if (ret < 0) {
2179         error_setg(errp, QERR_IO_ERROR);
2180     }
2181 
2182  the_end:
2183     if (saved_vm_running) {
2184         vm_start();
2185     }
2186 }
2187 
2188 void qmp_xen_load_devices_state(const char *filename, Error **errp)
2189 {
2190     QEMUFile *f;
2191     QIOChannelFile *ioc;
2192     int ret;
2193 
2194     /* Guest must be paused before loading the device state; the RAM state
2195      * will already have been loaded by xc
2196      */
2197     if (runstate_is_running()) {
2198         error_setg(errp, "Cannot update device state while vm is running");
2199         return;
2200     }
2201     vm_stop(RUN_STATE_RESTORE_VM);
2202 
2203     ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2204     if (!ioc) {
2205         return;
2206     }
2207     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2208     f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2209 
2210     ret = qemu_loadvm_state(f);
2211     qemu_fclose(f);
2212     if (ret < 0) {
2213         error_setg(errp, QERR_IO_ERROR);
2214     }
2215     migration_incoming_state_destroy();
2216 }
2217 
2218 int load_snapshot(const char *name, Error **errp)
2219 {
2220     BlockDriverState *bs, *bs_vm_state;
2221     QEMUSnapshotInfo sn;
2222     QEMUFile *f;
2223     int ret;
2224     AioContext *aio_context;
2225     MigrationIncomingState *mis = migration_incoming_get_current();
2226 
2227     if (!bdrv_all_can_snapshot(&bs)) {
2228         error_setg(errp,
2229                    "Device '%s' is writable but does not support snapshots",
2230                    bdrv_get_device_name(bs));
2231         return -ENOTSUP;
2232     }
2233     ret = bdrv_all_find_snapshot(name, &bs);
2234     if (ret < 0) {
2235         error_setg(errp,
2236                    "Device '%s' does not have the requested snapshot '%s'",
2237                    bdrv_get_device_name(bs), name);
2238         return ret;
2239     }
2240 
2241     bs_vm_state = bdrv_all_find_vmstate_bs();
2242     if (!bs_vm_state) {
2243         error_setg(errp, "No block device supports snapshots");
2244         return -ENOTSUP;
2245     }
2246     aio_context = bdrv_get_aio_context(bs_vm_state);
2247 
2248     /* Don't even try to load empty VM states */
2249     aio_context_acquire(aio_context);
2250     ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
2251     aio_context_release(aio_context);
2252     if (ret < 0) {
2253         return ret;
2254     } else if (sn.vm_state_size == 0) {
2255         error_setg(errp, "This is a disk-only snapshot. Revert to it "
2256                    " offline using qemu-img");
2257         return -EINVAL;
2258     }
2259 
2260     /* Flush all IO requests so they don't interfere with the new state.  */
2261     bdrv_drain_all();
2262 
2263     ret = bdrv_all_goto_snapshot(name, &bs);
2264     if (ret < 0) {
2265         error_setg(errp, "Error %d while activating snapshot '%s' on '%s'",
2266                      ret, name, bdrv_get_device_name(bs));
2267         return ret;
2268     }
2269 
2270     /* restore the VM state */
2271     f = qemu_fopen_bdrv(bs_vm_state, 0);
2272     if (!f) {
2273         error_setg(errp, "Could not open VM state file");
2274         return -EINVAL;
2275     }
2276 
2277     qemu_system_reset(SHUTDOWN_CAUSE_NONE);
2278     mis->from_src_file = f;
2279 
2280     aio_context_acquire(aio_context);
2281     ret = qemu_loadvm_state(f);
2282     aio_context_release(aio_context);
2283 
2284     migration_incoming_state_destroy();
2285     if (ret < 0) {
2286         error_setg(errp, "Error %d while loading VM state", ret);
2287         return ret;
2288     }
2289 
2290     return 0;
2291 }
2292 
2293 void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
2294 {
2295     qemu_ram_set_idstr(mr->ram_block,
2296                        memory_region_name(mr), dev);
2297 }
2298 
2299 void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
2300 {
2301     qemu_ram_unset_idstr(mr->ram_block);
2302 }
2303 
2304 void vmstate_register_ram_global(MemoryRegion *mr)
2305 {
2306     vmstate_register_ram(mr, NULL);
2307 }
2308 
2309 bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
2310 {
2311     /* check needed if --only-migratable is specified */
2312     if (!only_migratable) {
2313         return true;
2314     }
2315 
2316     return !(vmsd && vmsd->unmigratable);
2317 }
2318