xref: /openbmc/qemu/migration/savevm.c (revision 8b812533)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2009-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "hw/boards.h"
31 #include "hw/xen/xen.h"
32 #include "net/net.h"
33 #include "migration.h"
34 #include "migration/snapshot.h"
35 #include "migration/misc.h"
36 #include "migration/register.h"
37 #include "migration/global_state.h"
38 #include "ram.h"
39 #include "qemu-file-channel.h"
40 #include "qemu-file.h"
41 #include "savevm.h"
42 #include "postcopy-ram.h"
43 #include "qapi/qmp/qerror.h"
44 #include "qemu/error-report.h"
45 #include "sysemu/cpus.h"
46 #include "exec/memory.h"
47 #include "exec/target_page.h"
48 #include "qmp-commands.h"
49 #include "trace.h"
50 #include "qemu/iov.h"
51 #include "block/snapshot.h"
52 #include "qemu/cutils.h"
53 #include "io/channel-buffer.h"
54 #include "io/channel-file.h"
55 
/* Ethertype for RARP; only defined here if no included header provides it. */
#ifndef ETH_P_RARP
#define ETH_P_RARP 0x8035
#endif
/* Field values written into the RARP header built by announce_self_create(). */
#define ARP_HTYPE_ETH 0x0001
#define ARP_PTYPE_IP 0x0800
#define ARP_OP_REQUEST_REV 0x3

/* Value written as the version byte of MIG_CMD_POSTCOPY_RAM_DISCARD. */
const unsigned int postcopy_ram_discard_version = 0;
64 
/*
 * Subcommands for QEMU_VM_COMMAND.
 *
 * The enumerators also index the mig_cmd_args[] table, which records the
 * argument length and printable name of each command.
 */
enum qemu_vm_cmd {
    MIG_CMD_INVALID = 0,   /* Must be 0 */
    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
    MIG_CMD_PING,              /* Request a PONG on the RP */

    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
                                      warn we might want to do PC */
    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
                                      pages as it's running. */
    MIG_CMD_POSTCOPY_RUN,          /* Start execution */

    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
                                      were previously sent during
                                      precopy but are dirty. */
    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
    MIG_CMD_MAX                /* Number of commands; not a real command */
};
83 
/* Upper bound (16 MiB) accepted by qemu_savevm_send_packaged(). */
#define MAX_VM_CMD_PACKAGED_SIZE (1ul << 24)
/*
 * Per-command description, indexed by enum qemu_vm_cmd:
 *   len  - fixed size in bytes of the command's argument data,
 *          or -1 when the size is variable
 *   name - printable command name
 */
static struct mig_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name;
} mig_cmd_args[] = {
    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
    /* The 4 bytes are the be32 length of the wrapped stream that follows. */
    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
};
100 
/* Note for MIG_CMD_POSTCOPY_ADVISE:
 * The format of the arguments depends on the postcopy mode:
 * - postcopy RAM only
 *   uint64_t host page size
 *   uint64_t target page size
 *
 * - postcopy RAM and postcopy dirty bitmaps
 *   format is the same as for postcopy RAM only
 *
 * - postcopy dirty bitmaps only
 *   Nothing. Command length field is 0.
 *
 * Be careful: adding a new postcopy entity with some other parameters should
 * not break the format's self-description ability. A good way is to introduce
 * some generic extendable format with an exception for the two old entities.
 */
117 
118 static int announce_self_create(uint8_t *buf,
119                                 uint8_t *mac_addr)
120 {
121     /* Ethernet header. */
122     memset(buf, 0xff, 6);         /* destination MAC addr */
123     memcpy(buf + 6, mac_addr, 6); /* source MAC addr */
124     *(uint16_t *)(buf + 12) = htons(ETH_P_RARP); /* ethertype */
125 
126     /* RARP header. */
127     *(uint16_t *)(buf + 14) = htons(ARP_HTYPE_ETH); /* hardware addr space */
128     *(uint16_t *)(buf + 16) = htons(ARP_PTYPE_IP); /* protocol addr space */
129     *(buf + 18) = 6; /* hardware addr length (ethernet) */
130     *(buf + 19) = 4; /* protocol addr length (IPv4) */
131     *(uint16_t *)(buf + 20) = htons(ARP_OP_REQUEST_REV); /* opcode */
132     memcpy(buf + 22, mac_addr, 6); /* source hw addr */
133     memset(buf + 28, 0x00, 4);     /* source protocol addr */
134     memcpy(buf + 32, mac_addr, 6); /* target hw addr */
135     memset(buf + 38, 0x00, 4);     /* target protocol addr */
136 
137     /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */
138     memset(buf + 42, 0x00, 18);
139 
140     return 60; /* len (FCS will be added by hardware) */
141 }
142 
143 static void qemu_announce_self_iter(NICState *nic, void *opaque)
144 {
145     uint8_t buf[60];
146     int len;
147 
148     trace_qemu_announce_self_iter(qemu_ether_ntoa(&nic->conf->macaddr));
149     len = announce_self_create(buf, nic->conf->macaddr.a);
150 
151     qemu_send_packet_raw(qemu_get_queue(nic), buf, len);
152 }
153 
154 
155 static void qemu_announce_self_once(void *opaque)
156 {
157     static int count = SELF_ANNOUNCE_ROUNDS;
158     QEMUTimer *timer = *(QEMUTimer **)opaque;
159 
160     qemu_foreach_nic(qemu_announce_self_iter, NULL);
161 
162     if (--count) {
163         /* delay 50ms, 150ms, 250ms, ... */
164         timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
165                   self_announce_delay(count));
166     } else {
167             timer_del(timer);
168             timer_free(timer);
169     }
170 }
171 
172 void qemu_announce_self(void)
173 {
174     static QEMUTimer *timer;
175     timer = timer_new_ms(QEMU_CLOCK_REALTIME, qemu_announce_self_once, &timer);
176     qemu_announce_self_once(&timer);
177 }
178 
179 /***********************************************************/
180 /* savevm/loadvm support */
181 
182 static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
183                                    int64_t pos)
184 {
185     int ret;
186     QEMUIOVector qiov;
187 
188     qemu_iovec_init_external(&qiov, iov, iovcnt);
189     ret = bdrv_writev_vmstate(opaque, &qiov, pos);
190     if (ret < 0) {
191         return ret;
192     }
193 
194     return qiov.size;
195 }
196 
197 static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
198                                 size_t size)
199 {
200     return bdrv_load_vmstate(opaque, buf, pos, size);
201 }
202 
203 static int bdrv_fclose(void *opaque)
204 {
205     return bdrv_flush(opaque);
206 }
207 
/* Operations for a QEMUFile that reads VM state from a block device. */
static const QEMUFileOps bdrv_read_ops = {
    .get_buffer = block_get_buffer,
    .close =      bdrv_fclose
};

/* Operations for a QEMUFile that writes VM state to a block device. */
static const QEMUFileOps bdrv_write_ops = {
    .writev_buffer  = block_writev_buffer,
    .close          = bdrv_fclose
};
217 
218 static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
219 {
220     if (is_writable) {
221         return qemu_fopen_ops(bs, &bdrv_write_ops);
222     }
223     return qemu_fopen_ops(bs, &bdrv_read_ops);
224 }
225 
226 
227 /* QEMUFile timer support.
228  * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
229  */
230 
231 void timer_put(QEMUFile *f, QEMUTimer *ts)
232 {
233     uint64_t expire_time;
234 
235     expire_time = timer_expire_time_ns(ts);
236     qemu_put_be64(f, expire_time);
237 }
238 
239 void timer_get(QEMUFile *f, QEMUTimer *ts)
240 {
241     uint64_t expire_time;
242 
243     expire_time = qemu_get_be64(f);
244     if (expire_time != -1) {
245         timer_mod_ns(ts, expire_time);
246     } else {
247         timer_del(ts);
248     }
249 }
250 
251 
252 /* VMState timer support.
253  * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
254  */
255 
256 static int get_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field)
257 {
258     QEMUTimer *v = pv;
259     timer_get(f, v);
260     return 0;
261 }
262 
263 static int put_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field,
264                      QJSON *vmdesc)
265 {
266     QEMUTimer *v = pv;
267     timer_put(f, v);
268 
269     return 0;
270 }
271 
/* VMState field type for QEMUTimer, backed by get_timer()/put_timer(). */
const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};
277 
278 
/*
 * Secondary identity of a SaveStateEntry. It is only allocated when the
 * entry's idstr was prefixed with a device path; it then keeps the
 * un-prefixed name and its own instance id.
 */
typedef struct CompatEntry {
    char idstr[256];    /* name without the device-path prefix */
    int instance_id;    /* instance id within that un-prefixed name */
} CompatEntry;
283 
/* One registered save/load handler, linked into savevm_state.handlers. */
typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    /* "<device path>/<name>" when a device path is available, else "<name>" */
    char idstr[256];
    int instance_id;
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    /* assigned from savevm_state.global_section_id at registration */
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    /* old-style handlers; the register function for a vmsd leaves this NULL */
    SaveVMHandlers *ops;
    /* VMState description; NULL for old-style entries */
    const VMStateDescription *vmsd;
    /* passed back to the handlers / vmstate code */
    void *opaque;
    /* un-prefixed identity, or NULL (see CompatEntry) */
    CompatEntry *compat;
    /* set to 1 when the handlers provide save_setup */
    int is_ram;
} SaveStateEntry;
301 
/*
 * Global savevm state: the handler list plus the fields carried by the
 * "configuration" section (see vmstate_configuration).
 */
typedef struct SaveState {
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    /* next section id to hand out; incremented on every registration */
    int global_section_id;
    /* length of @name in bytes (no terminating NUL is counted) */
    uint32_t len;
    /* machine type name */
    const char *name;
    /* target page bits, carried by the target-page-bits subsection */
    uint32_t target_page_bits;
} SaveState;
309 
/* The single SaveState instance: empty handler list, section ids from 0. */
static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .global_section_id = 0,
};
314 
315 static void configuration_pre_save(void *opaque)
316 {
317     SaveState *state = opaque;
318     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
319 
320     state->len = strlen(current_name);
321     state->name = current_name;
322     state->target_page_bits = qemu_target_page_bits();
323 }
324 
325 static int configuration_pre_load(void *opaque)
326 {
327     SaveState *state = opaque;
328 
329     /* If there is no target-page-bits subsection it means the source
330      * predates the variable-target-page-bits support and is using the
331      * minimum possible value for this CPU.
332      */
333     state->target_page_bits = qemu_target_page_bits_min();
334     return 0;
335 }
336 
337 static int configuration_post_load(void *opaque, int version_id)
338 {
339     SaveState *state = opaque;
340     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
341 
342     if (strncmp(state->name, current_name, state->len) != 0) {
343         error_report("Machine type received is '%.*s' and local is '%s'",
344                      (int) state->len, state->name, current_name);
345         return -EINVAL;
346     }
347 
348     if (state->target_page_bits != qemu_target_page_bits()) {
349         error_report("Received TARGET_PAGE_BITS is %d but local is %d",
350                      state->target_page_bits, qemu_target_page_bits());
351         return -EINVAL;
352     }
353 
354     return 0;
355 }
356 
/*
 * The target-page-bits subsection is only sent when the target page size
 * differs from the default (i.e. the minimum page size for a
 * variable-page-size guest CPU). When present it carries the actual target
 * page bits of the machine, and migration fails if the two ends disagree.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    int current_bits = qemu_target_page_bits();
    int minimum_bits = qemu_target_page_bits_min();

    return current_bits > minimum_bits;
}
369 
/*
 * Optional subsection of "configuration" carrying SaveState.target_page_bits;
 * emitted only when vmstate_target_page_bits_needed() returns true.
 */
static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};
380 
/*
 * The "configuration" section: the machine type name as a length-prefixed
 * buffer (len, then len bytes of name), plus the optional target-page-bits
 * subsection. The hooks fill in the SaveState before saving and validate it
 * after loading.
 */
static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_target_page_bits,
        NULL
    }
};
397 
398 static void dump_vmstate_vmsd(FILE *out_file,
399                               const VMStateDescription *vmsd, int indent,
400                               bool is_subsection);
401 
402 static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
403                               int indent)
404 {
405     fprintf(out_file, "%*s{\n", indent, "");
406     indent += 2;
407     fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
408     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
409             field->version_id);
410     fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
411             field->field_exists ? "true" : "false");
412     fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
413     if (field->vmsd != NULL) {
414         fprintf(out_file, ",\n");
415         dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
416     }
417     fprintf(out_file, "\n%*s}", indent - 2, "");
418 }
419 
420 static void dump_vmstate_vmss(FILE *out_file,
421                               const VMStateDescription **subsection,
422                               int indent)
423 {
424     if (*subsection != NULL) {
425         dump_vmstate_vmsd(out_file, *subsection, indent, true);
426     }
427 }
428 
429 static void dump_vmstate_vmsd(FILE *out_file,
430                               const VMStateDescription *vmsd, int indent,
431                               bool is_subsection)
432 {
433     if (is_subsection) {
434         fprintf(out_file, "%*s{\n", indent, "");
435     } else {
436         fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
437     }
438     indent += 2;
439     fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
440     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
441             vmsd->version_id);
442     fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
443             vmsd->minimum_version_id);
444     if (vmsd->fields != NULL) {
445         const VMStateField *field = vmsd->fields;
446         bool first;
447 
448         fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
449         first = true;
450         while (field->name != NULL) {
451             if (field->flags & VMS_MUST_EXIST) {
452                 /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
453                 field++;
454                 continue;
455             }
456             if (!first) {
457                 fprintf(out_file, ",\n");
458             }
459             dump_vmstate_vmsf(out_file, field, indent + 2);
460             field++;
461             first = false;
462         }
463         fprintf(out_file, "\n%*s]", indent, "");
464     }
465     if (vmsd->subsections != NULL) {
466         const VMStateDescription **subsection = vmsd->subsections;
467         bool first;
468 
469         fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
470         first = true;
471         while (*subsection != NULL) {
472             if (!first) {
473                 fprintf(out_file, ",\n");
474             }
475             dump_vmstate_vmss(out_file, subsection, indent + 2);
476             subsection++;
477             first = false;
478         }
479         fprintf(out_file, "\n%*s]", indent, "");
480     }
481     fprintf(out_file, "\n%*s}", indent - 2, "");
482 }
483 
484 static void dump_machine_type(FILE *out_file)
485 {
486     MachineClass *mc;
487 
488     mc = MACHINE_GET_CLASS(current_machine);
489 
490     fprintf(out_file, "  \"vmschkmachine\": {\n");
491     fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
492     fprintf(out_file, "  },\n");
493 }
494 
495 void dump_vmstate_json_to_file(FILE *out_file)
496 {
497     GSList *list, *elt;
498     bool first;
499 
500     fprintf(out_file, "{\n");
501     dump_machine_type(out_file);
502 
503     first = true;
504     list = object_class_get_list(TYPE_DEVICE, true);
505     for (elt = list; elt; elt = elt->next) {
506         DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
507                                              TYPE_DEVICE);
508         const char *name;
509         int indent = 2;
510 
511         if (!dc->vmsd) {
512             continue;
513         }
514 
515         if (!first) {
516             fprintf(out_file, ",\n");
517         }
518         name = object_class_get_name(OBJECT_CLASS(dc));
519         fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
520         indent += 2;
521         fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
522         fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
523                 dc->vmsd->version_id);
524         fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
525                 dc->vmsd->minimum_version_id);
526 
527         dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
528 
529         fprintf(out_file, "\n%*s}", indent - 2, "");
530         first = false;
531     }
532     fprintf(out_file, "\n}\n");
533     fclose(out_file);
534 }
535 
536 static int calculate_new_instance_id(const char *idstr)
537 {
538     SaveStateEntry *se;
539     int instance_id = 0;
540 
541     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
542         if (strcmp(idstr, se->idstr) == 0
543             && instance_id <= se->instance_id) {
544             instance_id = se->instance_id + 1;
545         }
546     }
547     return instance_id;
548 }
549 
550 static int calculate_compat_instance_id(const char *idstr)
551 {
552     SaveStateEntry *se;
553     int instance_id = 0;
554 
555     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
556         if (!se->compat) {
557             continue;
558         }
559 
560         if (strcmp(idstr, se->compat->idstr) == 0
561             && instance_id <= se->compat->instance_id) {
562             instance_id = se->compat->instance_id + 1;
563         }
564     }
565     return instance_id;
566 }
567 
568 static inline MigrationPriority save_state_priority(SaveStateEntry *se)
569 {
570     if (se->vmsd) {
571         return se->vmsd->priority;
572     }
573     return MIG_PRI_DEFAULT;
574 }
575 
576 static void savevm_state_handler_insert(SaveStateEntry *nse)
577 {
578     MigrationPriority priority = save_state_priority(nse);
579     SaveStateEntry *se;
580 
581     assert(priority <= MIG_PRI_MAX);
582 
583     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
584         if (save_state_priority(se) < priority) {
585             break;
586         }
587     }
588 
589     if (se) {
590         QTAILQ_INSERT_BEFORE(se, nse, entry);
591     } else {
592         QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
593     }
594 }
595 
596 /* TODO: Individual devices generally have very little idea about the rest
597    of the system, so instance_id should be removed/replaced.
598    Meanwhile pass -1 as instance_id if you do not already have a clearly
599    distinguishing id for all instances of your device class. */
600 int register_savevm_live(DeviceState *dev,
601                          const char *idstr,
602                          int instance_id,
603                          int version_id,
604                          SaveVMHandlers *ops,
605                          void *opaque)
606 {
607     SaveStateEntry *se;
608 
609     se = g_new0(SaveStateEntry, 1);
610     se->version_id = version_id;
611     se->section_id = savevm_state.global_section_id++;
612     se->ops = ops;
613     se->opaque = opaque;
614     se->vmsd = NULL;
615     /* if this is a live_savem then set is_ram */
616     if (ops->save_setup != NULL) {
617         se->is_ram = 1;
618     }
619 
620     if (dev) {
621         char *id = qdev_get_dev_path(dev);
622         if (id) {
623             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
624                 sizeof(se->idstr)) {
625                 error_report("Path too long for VMState (%s)", id);
626                 g_free(id);
627                 g_free(se);
628 
629                 return -1;
630             }
631             g_free(id);
632 
633             se->compat = g_new0(CompatEntry, 1);
634             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr);
635             se->compat->instance_id = instance_id == -1 ?
636                          calculate_compat_instance_id(idstr) : instance_id;
637             instance_id = -1;
638         }
639     }
640     pstrcat(se->idstr, sizeof(se->idstr), idstr);
641 
642     if (instance_id == -1) {
643         se->instance_id = calculate_new_instance_id(se->idstr);
644     } else {
645         se->instance_id = instance_id;
646     }
647     assert(!se->compat || se->instance_id == 0);
648     savevm_state_handler_insert(se);
649     return 0;
650 }
651 
652 void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque)
653 {
654     SaveStateEntry *se, *new_se;
655     char id[256] = "";
656 
657     if (dev) {
658         char *path = qdev_get_dev_path(dev);
659         if (path) {
660             pstrcpy(id, sizeof(id), path);
661             pstrcat(id, sizeof(id), "/");
662             g_free(path);
663         }
664     }
665     pstrcat(id, sizeof(id), idstr);
666 
667     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
668         if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
669             QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
670             g_free(se->compat);
671             g_free(se);
672         }
673     }
674 }
675 
676 int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
677                                    const VMStateDescription *vmsd,
678                                    void *opaque, int alias_id,
679                                    int required_for_version,
680                                    Error **errp)
681 {
682     SaveStateEntry *se;
683 
684     /* If this triggers, alias support can be dropped for the vmsd. */
685     assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
686 
687     se = g_new0(SaveStateEntry, 1);
688     se->version_id = vmsd->version_id;
689     se->section_id = savevm_state.global_section_id++;
690     se->opaque = opaque;
691     se->vmsd = vmsd;
692     se->alias_id = alias_id;
693 
694     if (dev) {
695         char *id = qdev_get_dev_path(dev);
696         if (id) {
697             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
698                 sizeof(se->idstr)) {
699                 error_setg(errp, "Path too long for VMState (%s)", id);
700                 g_free(id);
701                 g_free(se);
702 
703                 return -1;
704             }
705             g_free(id);
706 
707             se->compat = g_new0(CompatEntry, 1);
708             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
709             se->compat->instance_id = instance_id == -1 ?
710                          calculate_compat_instance_id(vmsd->name) : instance_id;
711             instance_id = -1;
712         }
713     }
714     pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
715 
716     if (instance_id == -1) {
717         se->instance_id = calculate_new_instance_id(se->idstr);
718     } else {
719         se->instance_id = instance_id;
720     }
721     assert(!se->compat || se->instance_id == 0);
722     savevm_state_handler_insert(se);
723     return 0;
724 }
725 
726 void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd,
727                         void *opaque)
728 {
729     SaveStateEntry *se, *new_se;
730 
731     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
732         if (se->vmsd == vmsd && se->opaque == opaque) {
733             QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
734             g_free(se->compat);
735             g_free(se);
736         }
737     }
738 }
739 
740 static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
741 {
742     trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
743     if (!se->vmsd) {         /* Old style */
744         return se->ops->load_state(f, se->opaque, se->load_version_id);
745     }
746     return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
747 }
748 
749 static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
750 {
751     int64_t old_offset, size;
752 
753     old_offset = qemu_ftell_fast(f);
754     se->ops->save_state(f, se->opaque);
755     size = qemu_ftell_fast(f) - old_offset;
756 
757     if (vmdesc) {
758         json_prop_int(vmdesc, "size", size);
759         json_start_array(vmdesc, "fields");
760         json_start_object(vmdesc, NULL);
761         json_prop_str(vmdesc, "name", "data");
762         json_prop_int(vmdesc, "size", size);
763         json_prop_str(vmdesc, "type", "buffer");
764         json_end_object(vmdesc);
765         json_end_array(vmdesc);
766     }
767 }
768 
769 static void vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
770 {
771     trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
772     if (!se->vmsd) {
773         vmstate_save_old_style(f, se, vmdesc);
774         return;
775     }
776     vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
777 }
778 
779 /*
780  * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
781  */
782 static void save_section_header(QEMUFile *f, SaveStateEntry *se,
783                                 uint8_t section_type)
784 {
785     qemu_put_byte(f, section_type);
786     qemu_put_be32(f, se->section_id);
787 
788     if (section_type == QEMU_VM_SECTION_FULL ||
789         section_type == QEMU_VM_SECTION_START) {
790         /* ID string */
791         size_t len = strlen(se->idstr);
792         qemu_put_byte(f, len);
793         qemu_put_buffer(f, (uint8_t *)se->idstr, len);
794 
795         qemu_put_be32(f, se->instance_id);
796         qemu_put_be32(f, se->version_id);
797     }
798 }
799 
800 /*
801  * Write a footer onto device sections that catches cases misformatted device
802  * sections.
803  */
804 static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
805 {
806     if (migrate_get_current()->send_section_footer) {
807         qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
808         qemu_put_be32(f, se->section_id);
809     }
810 }
811 
/**
 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 *                           command and associated data.
 *
 * The element is written as: the QEMU_VM_COMMAND byte, the command as a
 * big-endian 16-bit value, the data length as a big-endian 16-bit value,
 * then @len bytes of @data. The file is flushed afterwards.
 *
 * @f: File to send command on
 * @command: Command type to send
 * @len: Length of associated data
 * @data: Data associated with command.
 */
static void qemu_savevm_command_send(QEMUFile *f,
                                     enum qemu_vm_cmd command,
                                     uint16_t len,
                                     uint8_t *data)
{
    trace_savevm_command_send(command, len);
    qemu_put_byte(f, QEMU_VM_COMMAND);
    qemu_put_be16(f, (uint16_t)command);
    qemu_put_be16(f, len);
    qemu_put_buffer(f, data, len);
    qemu_fflush(f);
}
833 
834 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
835 {
836     uint32_t buf;
837 
838     trace_savevm_send_ping(value);
839     buf = cpu_to_be32(value);
840     qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
841 }
842 
/* Send MIG_CMD_OPEN_RETURN_PATH, which carries no argument data. */
void qemu_savevm_send_open_return_path(QEMUFile *f)
{
    trace_savevm_send_open_return_path();
    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
}
848 
849 /* We have a buffer of data to send; we don't want that all to be loaded
850  * by the command itself, so the command contains just the length of the
851  * extra buffer that we then send straight after it.
852  * TODO: Must be a better way to organise that
853  *
854  * Returns:
855  *    0 on success
856  *    -ve on error
857  */
858 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
859 {
860     uint32_t tmp;
861 
862     if (len > MAX_VM_CMD_PACKAGED_SIZE) {
863         error_report("%s: Unreasonably large packaged state: %zu",
864                      __func__, len);
865         return -1;
866     }
867 
868     tmp = cpu_to_be32(len);
869 
870     trace_qemu_savevm_send_packaged();
871     qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
872 
873     qemu_put_buffer(f, buf, len);
874 
875     return 0;
876 }
877 
878 /* Send prior to any postcopy transfer */
879 void qemu_savevm_send_postcopy_advise(QEMUFile *f)
880 {
881     if (migrate_postcopy_ram()) {
882         uint64_t tmp[2];
883         tmp[0] = cpu_to_be64(ram_pagesize_summary());
884         tmp[1] = cpu_to_be64(qemu_target_page_size());
885 
886         trace_qemu_savevm_send_postcopy_advise();
887         qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
888                                  16, (uint8_t *)tmp);
889     } else {
890         qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
891     }
892 }
893 
894 /* Sent prior to starting the destination running in postcopy, discard pages
895  * that have already been sent but redirtied on the source.
896  * CMD_POSTCOPY_RAM_DISCARD consist of:
897  *      byte   version (0)
898  *      byte   Length of name field (not including 0)
899  *  n x byte   RAM block name
900  *      byte   0 terminator (just for safety)
901  *  n x        Byte ranges within the named RAMBlock
902  *      be64   Start of the range
903  *      be64   Length
904  *
905  *  name:  RAMBlock name that these entries are part of
906  *  len: Number of page entries
907  *  start_list: 'len' addresses
908  *  length_list: 'len' addresses
909  *
910  */
911 void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
912                                            uint16_t len,
913                                            uint64_t *start_list,
914                                            uint64_t *length_list)
915 {
916     uint8_t *buf;
917     uint16_t tmplen;
918     uint16_t t;
919     size_t name_len = strlen(name);
920 
921     trace_qemu_savevm_send_postcopy_ram_discard(name, len);
922     assert(name_len < 256);
923     buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
924     buf[0] = postcopy_ram_discard_version;
925     buf[1] = name_len;
926     memcpy(buf + 2, name, name_len);
927     tmplen = 2 + name_len;
928     buf[tmplen++] = '\0';
929 
930     for (t = 0; t < len; t++) {
931         stq_be_p(buf + tmplen, start_list[t]);
932         tmplen += 8;
933         stq_be_p(buf + tmplen, length_list[t]);
934         tmplen += 8;
935     }
936     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
937     g_free(buf);
938 }
939 
/*
 * Get the destination into a state where it can receive postcopy data.
 *
 * Sends MIG_CMD_POSTCOPY_LISTEN on @f with an empty payload.
 */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}
946 
/*
 * Kick the destination into running.
 *
 * Sends MIG_CMD_POSTCOPY_RUN on @f with an empty payload.
 */
void qemu_savevm_send_postcopy_run(QEMUFile *f)
{
    trace_savevm_send_postcopy_run();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
}
953 
954 bool qemu_savevm_state_blocked(Error **errp)
955 {
956     SaveStateEntry *se;
957 
958     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
959         if (se->vmsd && se->vmsd->unmigratable) {
960             error_setg(errp, "State blocked by non-migratable device '%s'",
961                        se->idstr);
962             return true;
963         }
964     }
965     return false;
966 }
967 
968 void qemu_savevm_state_header(QEMUFile *f)
969 {
970     trace_savevm_state_header();
971     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
972     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
973 
974     if (migrate_get_current()->send_configuration) {
975         qemu_put_byte(f, QEMU_VM_CONFIGURATION);
976         vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
977     }
978 }
979 
980 void qemu_savevm_state_setup(QEMUFile *f)
981 {
982     SaveStateEntry *se;
983     int ret;
984 
985     trace_savevm_state_setup();
986     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
987         if (!se->ops || !se->ops->save_setup) {
988             continue;
989         }
990         if (se->ops && se->ops->is_active) {
991             if (!se->ops->is_active(se->opaque)) {
992                 continue;
993             }
994         }
995         save_section_header(f, se, QEMU_VM_SECTION_START);
996 
997         ret = se->ops->save_setup(f, se->opaque);
998         save_section_footer(f, se);
999         if (ret < 0) {
1000             qemu_file_set_error(f, ret);
1001             break;
1002         }
1003     }
1004 }
1005 
1006 /*
1007  * this function has three return values:
1008  *   negative: there was one error, and we have -errno.
1009  *   0 : We haven't finished, caller have to go again
1010  *   1 : We have finished, we can go to complete phase
1011  */
1012 int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1013 {
1014     SaveStateEntry *se;
1015     int ret = 1;
1016 
1017     trace_savevm_state_iterate();
1018     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1019         if (!se->ops || !se->ops->save_live_iterate) {
1020             continue;
1021         }
1022         if (se->ops && se->ops->is_active) {
1023             if (!se->ops->is_active(se->opaque)) {
1024                 continue;
1025             }
1026         }
1027         /*
1028          * In the postcopy phase, any device that doesn't know how to
1029          * do postcopy should have saved it's state in the _complete
1030          * call that's already run, it might get confused if we call
1031          * iterate afterwards.
1032          */
1033         if (postcopy &&
1034             !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1035             continue;
1036         }
1037         if (qemu_file_rate_limit(f)) {
1038             return 0;
1039         }
1040         trace_savevm_section_start(se->idstr, se->section_id);
1041 
1042         save_section_header(f, se, QEMU_VM_SECTION_PART);
1043 
1044         ret = se->ops->save_live_iterate(f, se->opaque);
1045         trace_savevm_section_end(se->idstr, se->section_id, ret);
1046         save_section_footer(f, se);
1047 
1048         if (ret < 0) {
1049             qemu_file_set_error(f, ret);
1050         }
1051         if (ret <= 0) {
1052             /* Do not proceed to the next vmstate before this one reported
1053                completion of the current stage. This serializes the migration
1054                and reduces the probability that a faster changing state is
1055                synchronized over and over again. */
1056             break;
1057         }
1058     }
1059     return ret;
1060 }
1061 
1062 static bool should_send_vmdesc(void)
1063 {
1064     MachineState *machine = MACHINE(qdev_get_machine());
1065     bool in_postcopy = migration_in_postcopy();
1066     return !machine->suppress_vmdesc && !in_postcopy;
1067 }
1068 
1069 /*
1070  * Calls the save_live_complete_postcopy methods
1071  * causing the last few pages to be sent immediately and doing any associated
1072  * cleanup.
1073  * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1074  * all the other devices, but that happens at the point we switch to postcopy.
1075  */
1076 void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1077 {
1078     SaveStateEntry *se;
1079     int ret;
1080 
1081     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1082         if (!se->ops || !se->ops->save_live_complete_postcopy) {
1083             continue;
1084         }
1085         if (se->ops && se->ops->is_active) {
1086             if (!se->ops->is_active(se->opaque)) {
1087                 continue;
1088             }
1089         }
1090         trace_savevm_section_start(se->idstr, se->section_id);
1091         /* Section type */
1092         qemu_put_byte(f, QEMU_VM_SECTION_END);
1093         qemu_put_be32(f, se->section_id);
1094 
1095         ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1096         trace_savevm_section_end(se->idstr, se->section_id, ret);
1097         save_section_footer(f, se);
1098         if (ret < 0) {
1099             qemu_file_set_error(f, ret);
1100             return;
1101         }
1102     }
1103 
1104     qemu_put_byte(f, QEMU_VM_EOF);
1105     qemu_fflush(f);
1106 }
1107 
1108 int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1109                                        bool inactivate_disks)
1110 {
1111     QJSON *vmdesc;
1112     int vmdesc_len;
1113     SaveStateEntry *se;
1114     int ret;
1115     bool in_postcopy = migration_in_postcopy();
1116 
1117     trace_savevm_state_complete_precopy();
1118 
1119     cpu_synchronize_all_states();
1120 
1121     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1122         if (!se->ops ||
1123             (in_postcopy && se->ops->has_postcopy &&
1124              se->ops->has_postcopy(se->opaque)) ||
1125             (in_postcopy && !iterable_only) ||
1126             !se->ops->save_live_complete_precopy) {
1127             continue;
1128         }
1129 
1130         if (se->ops && se->ops->is_active) {
1131             if (!se->ops->is_active(se->opaque)) {
1132                 continue;
1133             }
1134         }
1135         trace_savevm_section_start(se->idstr, se->section_id);
1136 
1137         save_section_header(f, se, QEMU_VM_SECTION_END);
1138 
1139         ret = se->ops->save_live_complete_precopy(f, se->opaque);
1140         trace_savevm_section_end(se->idstr, se->section_id, ret);
1141         save_section_footer(f, se);
1142         if (ret < 0) {
1143             qemu_file_set_error(f, ret);
1144             return -1;
1145         }
1146     }
1147 
1148     if (iterable_only) {
1149         return 0;
1150     }
1151 
1152     vmdesc = qjson_new();
1153     json_prop_int(vmdesc, "page_size", qemu_target_page_size());
1154     json_start_array(vmdesc, "devices");
1155     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1156 
1157         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1158             continue;
1159         }
1160         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1161             trace_savevm_section_skip(se->idstr, se->section_id);
1162             continue;
1163         }
1164 
1165         trace_savevm_section_start(se->idstr, se->section_id);
1166 
1167         json_start_object(vmdesc, NULL);
1168         json_prop_str(vmdesc, "name", se->idstr);
1169         json_prop_int(vmdesc, "instance_id", se->instance_id);
1170 
1171         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1172         vmstate_save(f, se, vmdesc);
1173         trace_savevm_section_end(se->idstr, se->section_id, 0);
1174         save_section_footer(f, se);
1175 
1176         json_end_object(vmdesc);
1177     }
1178 
1179     if (inactivate_disks) {
1180         /* Inactivate before sending QEMU_VM_EOF so that the
1181          * bdrv_invalidate_cache_all() on the other end won't fail. */
1182         ret = bdrv_inactivate_all();
1183         if (ret) {
1184             error_report("%s: bdrv_inactivate_all() failed (%d)",
1185                          __func__, ret);
1186             qemu_file_set_error(f, ret);
1187             return ret;
1188         }
1189     }
1190     if (!in_postcopy) {
1191         /* Postcopy stream will still be going */
1192         qemu_put_byte(f, QEMU_VM_EOF);
1193     }
1194 
1195     json_end_array(vmdesc);
1196     qjson_finish(vmdesc);
1197     vmdesc_len = strlen(qjson_get_str(vmdesc));
1198 
1199     if (should_send_vmdesc()) {
1200         qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1201         qemu_put_be32(f, vmdesc_len);
1202         qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len);
1203     }
1204     qjson_destroy(vmdesc);
1205 
1206     qemu_fflush(f);
1207     return 0;
1208 }
1209 
1210 /* Give an estimate of the amount left to be transferred,
1211  * the result is split into the amount for units that can and
1212  * for units that can't do postcopy.
1213  */
1214 void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1215                                uint64_t *res_non_postcopiable,
1216                                uint64_t *res_postcopiable)
1217 {
1218     SaveStateEntry *se;
1219 
1220     *res_non_postcopiable = 0;
1221     *res_postcopiable = 0;
1222 
1223 
1224     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1225         if (!se->ops || !se->ops->save_live_pending) {
1226             continue;
1227         }
1228         if (se->ops && se->ops->is_active) {
1229             if (!se->ops->is_active(se->opaque)) {
1230                 continue;
1231             }
1232         }
1233         se->ops->save_live_pending(f, se->opaque, threshold_size,
1234                                    res_non_postcopiable, res_postcopiable);
1235     }
1236 }
1237 
1238 void qemu_savevm_state_cleanup(void)
1239 {
1240     SaveStateEntry *se;
1241 
1242     trace_savevm_state_cleanup();
1243     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1244         if (se->ops && se->ops->save_cleanup) {
1245             se->ops->save_cleanup(se->opaque);
1246         }
1247     }
1248 }
1249 
1250 static int qemu_savevm_state(QEMUFile *f, Error **errp)
1251 {
1252     int ret;
1253     MigrationState *ms = migrate_init();
1254     MigrationStatus status;
1255     ms->to_dst_file = f;
1256 
1257     if (migration_is_blocked(errp)) {
1258         ret = -EINVAL;
1259         goto done;
1260     }
1261 
1262     if (migrate_use_block()) {
1263         error_setg(errp, "Block migration and snapshots are incompatible");
1264         ret = -EINVAL;
1265         goto done;
1266     }
1267 
1268     qemu_mutex_unlock_iothread();
1269     qemu_savevm_state_header(f);
1270     qemu_savevm_state_setup(f);
1271     qemu_mutex_lock_iothread();
1272 
1273     while (qemu_file_get_error(f) == 0) {
1274         if (qemu_savevm_state_iterate(f, false) > 0) {
1275             break;
1276         }
1277     }
1278 
1279     ret = qemu_file_get_error(f);
1280     if (ret == 0) {
1281         qemu_savevm_state_complete_precopy(f, false, false);
1282         ret = qemu_file_get_error(f);
1283     }
1284     qemu_savevm_state_cleanup();
1285     if (ret != 0) {
1286         error_setg_errno(errp, -ret, "Error while writing VM state");
1287     }
1288 
1289 done:
1290     if (ret != 0) {
1291         status = MIGRATION_STATUS_FAILED;
1292     } else {
1293         status = MIGRATION_STATUS_COMPLETED;
1294     }
1295     migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1296 
1297     /* f is outer parameter, it should not stay in global migration state after
1298      * this function finished */
1299     ms->to_dst_file = NULL;
1300 
1301     return ret;
1302 }
1303 
1304 static int qemu_save_device_state(QEMUFile *f)
1305 {
1306     SaveStateEntry *se;
1307 
1308     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1309     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1310 
1311     cpu_synchronize_all_states();
1312 
1313     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1314         if (se->is_ram) {
1315             continue;
1316         }
1317         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1318             continue;
1319         }
1320         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1321             continue;
1322         }
1323 
1324         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1325 
1326         vmstate_save(f, se, NULL);
1327 
1328         save_section_footer(f, se);
1329     }
1330 
1331     qemu_put_byte(f, QEMU_VM_EOF);
1332 
1333     return qemu_file_get_error(f);
1334 }
1335 
1336 static SaveStateEntry *find_se(const char *idstr, int instance_id)
1337 {
1338     SaveStateEntry *se;
1339 
1340     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1341         if (!strcmp(se->idstr, idstr) &&
1342             (instance_id == se->instance_id ||
1343              instance_id == se->alias_id))
1344             return se;
1345         /* Migrating from an older version? */
1346         if (strstr(se->idstr, idstr) && se->compat) {
1347             if (!strcmp(se->compat->idstr, idstr) &&
1348                 (instance_id == se->compat->instance_id ||
1349                  instance_id == se->alias_id))
1350                 return se;
1351         }
1352     }
1353     return NULL;
1354 }
1355 
/* Non-error return codes that loadvm command handlers can hand back */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT     =  1,
};
1360 
1361 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
1362 
1363 /* ------ incoming postcopy messages ------ */
1364 /* 'advise' arrives before any transfers just to tell us that a postcopy
1365  * *might* happen - it might be skipped if precopy transferred everything
1366  * quickly.
1367  */
1368 static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
1369 {
1370     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1371     uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1372 
1373     trace_loadvm_postcopy_handle_advise();
1374     if (ps != POSTCOPY_INCOMING_NONE) {
1375         error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1376         return -1;
1377     }
1378 
1379     if (!migrate_postcopy_ram()) {
1380         return 0;
1381     }
1382 
1383     if (!postcopy_ram_supported_by_host(mis)) {
1384         postcopy_state_set(POSTCOPY_INCOMING_NONE);
1385         return -1;
1386     }
1387 
1388     remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1389     local_pagesize_summary = ram_pagesize_summary();
1390 
1391     if (remote_pagesize_summary != local_pagesize_summary)  {
1392         /*
1393          * This detects two potential causes of mismatch:
1394          *   a) A mismatch in host page sizes
1395          *      Some combinations of mismatch are probably possible but it gets
1396          *      a bit more complicated.  In particular we need to place whole
1397          *      host pages on the dest at once, and we need to ensure that we
1398          *      handle dirtying to make sure we never end up sending part of
1399          *      a hostpage on it's own.
1400          *   b) The use of different huge page sizes on source/destination
1401          *      a more fine grain test is performed during RAM block migration
1402          *      but this test here causes a nice early clear failure, and
1403          *      also fails when passed to an older qemu that doesn't
1404          *      do huge pages.
1405          */
1406         error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1407                                                              " d=%" PRIx64 ")",
1408                      remote_pagesize_summary, local_pagesize_summary);
1409         return -1;
1410     }
1411 
1412     remote_tps = qemu_get_be64(mis->from_src_file);
1413     if (remote_tps != qemu_target_page_size()) {
1414         /*
1415          * Again, some differences could be dealt with, but for now keep it
1416          * simple.
1417          */
1418         error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1419                      (int)remote_tps, qemu_target_page_size());
1420         return -1;
1421     }
1422 
1423     if (ram_postcopy_incoming_init(mis)) {
1424         return -1;
1425     }
1426 
1427     postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1428 
1429     return 0;
1430 }
1431 
1432 /* After postcopy we will be told to throw some pages away since they're
1433  * dirty and will have to be demand fetched.  Must happen before CPU is
1434  * started.
1435  * There can be 0..many of these messages, each encoding multiple pages.
1436  */
1437 static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1438                                               uint16_t len)
1439 {
1440     int tmp;
1441     char ramid[256];
1442     PostcopyState ps = postcopy_state_get();
1443 
1444     trace_loadvm_postcopy_ram_handle_discard();
1445 
1446     switch (ps) {
1447     case POSTCOPY_INCOMING_ADVISE:
1448         /* 1st discard */
1449         tmp = postcopy_ram_prepare_discard(mis);
1450         if (tmp) {
1451             return tmp;
1452         }
1453         break;
1454 
1455     case POSTCOPY_INCOMING_DISCARD:
1456         /* Expected state */
1457         break;
1458 
1459     default:
1460         error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1461                      ps);
1462         return -1;
1463     }
1464     /* We're expecting a
1465      *    Version (0)
1466      *    a RAM ID string (length byte, name, 0 term)
1467      *    then at least 1 16 byte chunk
1468     */
1469     if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1470         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1471         return -1;
1472     }
1473 
1474     tmp = qemu_get_byte(mis->from_src_file);
1475     if (tmp != postcopy_ram_discard_version) {
1476         error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1477         return -1;
1478     }
1479 
1480     if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1481         error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1482         return -1;
1483     }
1484     tmp = qemu_get_byte(mis->from_src_file);
1485     if (tmp != 0) {
1486         error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1487         return -1;
1488     }
1489 
1490     len -= 3 + strlen(ramid);
1491     if (len % 16) {
1492         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1493         return -1;
1494     }
1495     trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1496     while (len) {
1497         uint64_t start_addr, block_length;
1498         start_addr = qemu_get_be64(mis->from_src_file);
1499         block_length = qemu_get_be64(mis->from_src_file);
1500 
1501         len -= 16;
1502         int ret = ram_discard_range(ramid, start_addr, block_length);
1503         if (ret) {
1504             return ret;
1505         }
1506     }
1507     trace_loadvm_postcopy_ram_handle_discard_end();
1508 
1509     return 0;
1510 }
1511 
1512 /*
1513  * Triggered by a postcopy_listen command; this thread takes over reading
1514  * the input stream, leaving the main thread free to carry on loading the rest
1515  * of the device state (from RAM).
1516  * (TODO:This could do with being in a postcopy file - but there again it's
1517  * just another input loop, not that postcopy specific)
1518  */
1519 static void *postcopy_ram_listen_thread(void *opaque)
1520 {
1521     QEMUFile *f = opaque;
1522     MigrationIncomingState *mis = migration_incoming_get_current();
1523     int load_res;
1524 
1525     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1526                                    MIGRATION_STATUS_POSTCOPY_ACTIVE);
1527     qemu_sem_post(&mis->listen_thread_sem);
1528     trace_postcopy_ram_listen_thread_start();
1529 
1530     /*
1531      * Because we're a thread and not a coroutine we can't yield
1532      * in qemu_file, and thus we must be blocking now.
1533      */
1534     qemu_file_set_blocking(f, true);
1535     load_res = qemu_loadvm_state_main(f, mis);
1536     /* And non-blocking again so we don't block in any cleanup */
1537     qemu_file_set_blocking(f, false);
1538 
1539     trace_postcopy_ram_listen_thread_exit();
1540     if (load_res < 0) {
1541         error_report("%s: loadvm failed: %d", __func__, load_res);
1542         qemu_file_set_error(f, load_res);
1543         migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1544                                        MIGRATION_STATUS_FAILED);
1545     } else {
1546         /*
1547          * This looks good, but it's possible that the device loading in the
1548          * main thread hasn't finished yet, and so we might not be in 'RUN'
1549          * state yet; wait for the end of the main thread.
1550          */
1551         qemu_event_wait(&mis->main_thread_load_event);
1552     }
1553     postcopy_ram_incoming_cleanup(mis);
1554 
1555     if (load_res < 0) {
1556         /*
1557          * If something went wrong then we have a bad state so exit;
1558          * depending how far we got it might be possible at this point
1559          * to leave the guest running and fire MCEs for pages that never
1560          * arrived as a desperate recovery step.
1561          */
1562         exit(EXIT_FAILURE);
1563     }
1564 
1565     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1566                                    MIGRATION_STATUS_COMPLETED);
1567     /*
1568      * If everything has worked fine, then the main thread has waited
1569      * for us to start, and we're the last use of the mis.
1570      * (If something broke then qemu will have to exit anyway since it's
1571      * got a bad migration state).
1572      */
1573     migration_incoming_state_destroy();
1574     qemu_loadvm_state_cleanup();
1575 
1576     return NULL;
1577 }
1578 
1579 /* After this message we must be able to immediately receive postcopy data */
1580 static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1581 {
1582     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1583     trace_loadvm_postcopy_handle_listen();
1584     if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1585         error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1586         return -1;
1587     }
1588     if (ps == POSTCOPY_INCOMING_ADVISE) {
1589         /*
1590          * A rare case, we entered listen without having to do any discards,
1591          * so do the setup that's normally done at the time of the 1st discard.
1592          */
1593         if (migrate_postcopy_ram()) {
1594             postcopy_ram_prepare_discard(mis);
1595         }
1596     }
1597 
1598     /*
1599      * Sensitise RAM - can now generate requests for blocks that don't exist
1600      * However, at this point the CPU shouldn't be running, and the IO
1601      * shouldn't be doing anything yet so don't actually expect requests
1602      */
1603     if (migrate_postcopy_ram()) {
1604         if (postcopy_ram_enable_notify(mis)) {
1605             return -1;
1606         }
1607     }
1608 
1609     if (mis->have_listen_thread) {
1610         error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
1611         return -1;
1612     }
1613 
1614     mis->have_listen_thread = true;
1615     /* Start up the listening thread and wait for it to signal ready */
1616     qemu_sem_init(&mis->listen_thread_sem, 0);
1617     qemu_thread_create(&mis->listen_thread, "postcopy/listen",
1618                        postcopy_ram_listen_thread, mis->from_src_file,
1619                        QEMU_THREAD_DETACHED);
1620     qemu_sem_wait(&mis->listen_thread_sem);
1621     qemu_sem_destroy(&mis->listen_thread_sem);
1622 
1623     return 0;
1624 }
1625 
1626 
/*
 * Context handed to loadvm_postcopy_handle_run_bh(); it carries the bottom
 * half itself so that the callback can delete it.
 */
typedef struct {
    QEMUBH *bh;
} HandleRunBhData;
1630 
1631 static void loadvm_postcopy_handle_run_bh(void *opaque)
1632 {
1633     Error *local_err = NULL;
1634     HandleRunBhData *data = opaque;
1635 
1636     /* TODO we should move all of this lot into postcopy_ram.c or a shared code
1637      * in migration.c
1638      */
1639     cpu_synchronize_all_post_init();
1640 
1641     qemu_announce_self();
1642 
1643     /* Make sure all file formats flush their mutable metadata.
1644      * If we get an error here, just don't restart the VM yet. */
1645     bdrv_invalidate_cache_all(&local_err);
1646     if (local_err) {
1647         error_report_err(local_err);
1648         local_err = NULL;
1649         autostart = false;
1650     }
1651 
1652     trace_loadvm_postcopy_handle_run_cpu_sync();
1653     cpu_synchronize_all_post_init();
1654 
1655     trace_loadvm_postcopy_handle_run_vmstart();
1656 
1657     if (autostart) {
1658         /* Hold onto your hats, starting the CPU */
1659         vm_start();
1660     } else {
1661         /* leave it paused and let management decide when to start the CPU */
1662         runstate_set(RUN_STATE_PAUSED);
1663     }
1664 
1665     qemu_bh_delete(data->bh);
1666     g_free(data);
1667 }
1668 
1669 /* After all discards we can start running and asking for pages */
1670 static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
1671 {
1672     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
1673     HandleRunBhData *data;
1674 
1675     trace_loadvm_postcopy_handle_run();
1676     if (ps != POSTCOPY_INCOMING_LISTENING) {
1677         error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
1678         return -1;
1679     }
1680 
1681     data = g_new(HandleRunBhData, 1);
1682     data->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, data);
1683     qemu_bh_schedule(data->bh);
1684 
1685     /* We need to finish reading the stream from the package
1686      * and also stop reading anything more from the stream that loaded the
1687      * package (since it's now being read by the listener thread).
1688      * LOADVM_QUIT will quit all the layers of nested loadvm loops.
1689      */
1690     return LOADVM_QUIT;
1691 }
1692 
1693 /**
1694  * Immediately following this command is a blob of data containing an embedded
1695  * chunk of migration stream; read it and load it.
1696  *
1697  * @mis: Incoming state
1698  * @length: Length of packaged data to read
1699  *
1700  * Returns: Negative values on error
1701  *
1702  */
1703 static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
1704 {
1705     int ret;
1706     size_t length;
1707     QIOChannelBuffer *bioc;
1708 
1709     length = qemu_get_be32(mis->from_src_file);
1710     trace_loadvm_handle_cmd_packaged(length);
1711 
1712     if (length > MAX_VM_CMD_PACKAGED_SIZE) {
1713         error_report("Unreasonably large packaged state: %zu", length);
1714         return -1;
1715     }
1716 
1717     bioc = qio_channel_buffer_new(length);
1718     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
1719     ret = qemu_get_buffer(mis->from_src_file,
1720                           bioc->data,
1721                           length);
1722     if (ret != length) {
1723         object_unref(OBJECT(bioc));
1724         error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
1725                      ret, length);
1726         return (ret < 0) ? ret : -EAGAIN;
1727     }
1728     bioc->usage += length;
1729     trace_loadvm_handle_cmd_packaged_received(ret);
1730 
1731     QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
1732 
1733     ret = qemu_loadvm_state_main(packf, mis);
1734     trace_loadvm_handle_cmd_packaged_main(ret);
1735     qemu_fclose(packf);
1736     object_unref(OBJECT(bioc));
1737 
1738     return ret;
1739 }
1740 
1741 /*
1742  * Process an incoming 'QEMU_VM_COMMAND'
1743  * 0           just a normal return
1744  * LOADVM_QUIT All good, but exit the loop
1745  * <0          Error
1746  */
1747 static int loadvm_process_command(QEMUFile *f)
1748 {
1749     MigrationIncomingState *mis = migration_incoming_get_current();
1750     uint16_t cmd;
1751     uint16_t len;
1752     uint32_t tmp32;
1753 
1754     cmd = qemu_get_be16(f);
1755     len = qemu_get_be16(f);
1756 
1757     trace_loadvm_process_command(cmd, len);
1758     if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
1759         error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
1760         return -EINVAL;
1761     }
1762 
1763     if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
1764         error_report("%s received with bad length - expecting %zu, got %d",
1765                      mig_cmd_args[cmd].name,
1766                      (size_t)mig_cmd_args[cmd].len, len);
1767         return -ERANGE;
1768     }
1769 
1770     switch (cmd) {
1771     case MIG_CMD_OPEN_RETURN_PATH:
1772         if (mis->to_src_file) {
1773             error_report("CMD_OPEN_RETURN_PATH called when RP already open");
1774             /* Not really a problem, so don't give up */
1775             return 0;
1776         }
1777         mis->to_src_file = qemu_file_get_return_path(f);
1778         if (!mis->to_src_file) {
1779             error_report("CMD_OPEN_RETURN_PATH failed");
1780             return -1;
1781         }
1782         break;
1783 
1784     case MIG_CMD_PING:
1785         tmp32 = qemu_get_be32(f);
1786         trace_loadvm_process_command_ping(tmp32);
1787         if (!mis->to_src_file) {
1788             error_report("CMD_PING (0x%x) received with no return path",
1789                          tmp32);
1790             return -1;
1791         }
1792         migrate_send_rp_pong(mis, tmp32);
1793         break;
1794 
1795     case MIG_CMD_PACKAGED:
1796         return loadvm_handle_cmd_packaged(mis);
1797 
1798     case MIG_CMD_POSTCOPY_ADVISE:
1799         return loadvm_postcopy_handle_advise(mis);
1800 
1801     case MIG_CMD_POSTCOPY_LISTEN:
1802         return loadvm_postcopy_handle_listen(mis);
1803 
1804     case MIG_CMD_POSTCOPY_RUN:
1805         return loadvm_postcopy_handle_run(mis);
1806 
1807     case MIG_CMD_POSTCOPY_RAM_DISCARD:
1808         return loadvm_postcopy_ram_handle_discard(mis, len);
1809     }
1810 
1811     return 0;
1812 }
1813 
1814 /*
1815  * Read a footer off the wire and check that it matches the expected section
1816  *
1817  * Returns: true if the footer was good
1818  *          false if there is a problem (and calls error_report to say why)
1819  */
1820 static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
1821 {
1822     uint8_t read_mark;
1823     uint32_t read_section_id;
1824 
1825     if (!migrate_get_current()->send_section_footer) {
1826         /* No footer to check */
1827         return true;
1828     }
1829 
1830     read_mark = qemu_get_byte(f);
1831 
1832     if (read_mark != QEMU_VM_SECTION_FOOTER) {
1833         error_report("Missing section footer for %s", se->idstr);
1834         return false;
1835     }
1836 
1837     read_section_id = qemu_get_be32(f);
1838     if (read_section_id != se->load_section_id) {
1839         error_report("Mismatched section id in footer for %s -"
1840                      " read 0x%x expected 0x%x",
1841                      se->idstr, read_section_id, se->load_section_id);
1842         return false;
1843     }
1844 
1845     /* All good */
1846     return true;
1847 }
1848 
1849 static int
1850 qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
1851 {
1852     uint32_t instance_id, version_id, section_id;
1853     SaveStateEntry *se;
1854     char idstr[256];
1855     int ret;
1856 
1857     /* Read section start */
1858     section_id = qemu_get_be32(f);
1859     if (!qemu_get_counted_string(f, idstr)) {
1860         error_report("Unable to read ID string for section %u",
1861                      section_id);
1862         return -EINVAL;
1863     }
1864     instance_id = qemu_get_be32(f);
1865     version_id = qemu_get_be32(f);
1866 
1867     trace_qemu_loadvm_state_section_startfull(section_id, idstr,
1868             instance_id, version_id);
1869     /* Find savevm section */
1870     se = find_se(idstr, instance_id);
1871     if (se == NULL) {
1872         error_report("Unknown savevm section or instance '%s' %d",
1873                      idstr, instance_id);
1874         return -EINVAL;
1875     }
1876 
1877     /* Validate version */
1878     if (version_id > se->version_id) {
1879         error_report("savevm: unsupported version %d for '%s' v%d",
1880                      version_id, idstr, se->version_id);
1881         return -EINVAL;
1882     }
1883     se->load_version_id = version_id;
1884     se->load_section_id = section_id;
1885 
1886     /* Validate if it is a device's state */
1887     if (xen_enabled() && se->is_ram) {
1888         error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
1889         return -EINVAL;
1890     }
1891 
1892     ret = vmstate_load(f, se);
1893     if (ret < 0) {
1894         error_report("error while loading state for instance 0x%x of"
1895                      " device '%s'", instance_id, idstr);
1896         return ret;
1897     }
1898     if (!check_section_footer(f, se)) {
1899         return -EINVAL;
1900     }
1901 
1902     return 0;
1903 }
1904 
1905 static int
1906 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
1907 {
1908     uint32_t section_id;
1909     SaveStateEntry *se;
1910     int ret;
1911 
1912     section_id = qemu_get_be32(f);
1913 
1914     trace_qemu_loadvm_state_section_partend(section_id);
1915     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1916         if (se->load_section_id == section_id) {
1917             break;
1918         }
1919     }
1920     if (se == NULL) {
1921         error_report("Unknown savevm section %d", section_id);
1922         return -EINVAL;
1923     }
1924 
1925     ret = vmstate_load(f, se);
1926     if (ret < 0) {
1927         error_report("error while loading state section id %d(%s)",
1928                      section_id, se->idstr);
1929         return ret;
1930     }
1931     if (!check_section_footer(f, se)) {
1932         return -EINVAL;
1933     }
1934 
1935     return 0;
1936 }
1937 
1938 static int qemu_loadvm_state_setup(QEMUFile *f)
1939 {
1940     SaveStateEntry *se;
1941     int ret;
1942 
1943     trace_loadvm_state_setup();
1944     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1945         if (!se->ops || !se->ops->load_setup) {
1946             continue;
1947         }
1948         if (se->ops && se->ops->is_active) {
1949             if (!se->ops->is_active(se->opaque)) {
1950                 continue;
1951             }
1952         }
1953 
1954         ret = se->ops->load_setup(f, se->opaque);
1955         if (ret < 0) {
1956             qemu_file_set_error(f, ret);
1957             error_report("Load state of device %s failed", se->idstr);
1958             return ret;
1959         }
1960     }
1961     return 0;
1962 }
1963 
1964 void qemu_loadvm_state_cleanup(void)
1965 {
1966     SaveStateEntry *se;
1967 
1968     trace_loadvm_state_cleanup();
1969     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1970         if (se->ops && se->ops->load_cleanup) {
1971             se->ops->load_cleanup(se->opaque);
1972         }
1973     }
1974 }
1975 
1976 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
1977 {
1978     uint8_t section_type;
1979     int ret = 0;
1980 
1981     while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
1982         ret = 0;
1983         trace_qemu_loadvm_state_section(section_type);
1984         switch (section_type) {
1985         case QEMU_VM_SECTION_START:
1986         case QEMU_VM_SECTION_FULL:
1987             ret = qemu_loadvm_section_start_full(f, mis);
1988             if (ret < 0) {
1989                 goto out;
1990             }
1991             break;
1992         case QEMU_VM_SECTION_PART:
1993         case QEMU_VM_SECTION_END:
1994             ret = qemu_loadvm_section_part_end(f, mis);
1995             if (ret < 0) {
1996                 goto out;
1997             }
1998             break;
1999         case QEMU_VM_COMMAND:
2000             ret = loadvm_process_command(f);
2001             trace_qemu_loadvm_state_section_command(ret);
2002             if ((ret < 0) || (ret & LOADVM_QUIT)) {
2003                 goto out;
2004             }
2005             break;
2006         default:
2007             error_report("Unknown savevm section type %d", section_type);
2008             ret = -EINVAL;
2009             goto out;
2010         }
2011     }
2012 
2013 out:
2014     if (ret < 0) {
2015         qemu_file_set_error(f, ret);
2016     }
2017     return ret;
2018 }
2019 
2020 int qemu_loadvm_state(QEMUFile *f)
2021 {
2022     MigrationIncomingState *mis = migration_incoming_get_current();
2023     Error *local_err = NULL;
2024     unsigned int v;
2025     int ret;
2026 
2027     if (qemu_savevm_state_blocked(&local_err)) {
2028         error_report_err(local_err);
2029         return -EINVAL;
2030     }
2031 
2032     v = qemu_get_be32(f);
2033     if (v != QEMU_VM_FILE_MAGIC) {
2034         error_report("Not a migration stream");
2035         return -EINVAL;
2036     }
2037 
2038     v = qemu_get_be32(f);
2039     if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2040         error_report("SaveVM v2 format is obsolete and don't work anymore");
2041         return -ENOTSUP;
2042     }
2043     if (v != QEMU_VM_FILE_VERSION) {
2044         error_report("Unsupported migration stream version");
2045         return -ENOTSUP;
2046     }
2047 
2048     if (qemu_loadvm_state_setup(f) != 0) {
2049         return -EINVAL;
2050     }
2051 
2052     if (migrate_get_current()->send_configuration) {
2053         if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2054             error_report("Configuration section missing");
2055             return -EINVAL;
2056         }
2057         ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2058 
2059         if (ret) {
2060             return ret;
2061         }
2062     }
2063 
2064     cpu_synchronize_all_pre_loadvm();
2065 
2066     ret = qemu_loadvm_state_main(f, mis);
2067     qemu_event_set(&mis->main_thread_load_event);
2068 
2069     trace_qemu_loadvm_state_post_main(ret);
2070 
2071     if (mis->have_listen_thread) {
2072         /* Listen thread still going, can't clean up yet */
2073         return ret;
2074     }
2075 
2076     if (ret == 0) {
2077         ret = qemu_file_get_error(f);
2078     }
2079 
2080     /*
2081      * Try to read in the VMDESC section as well, so that dumping tools that
2082      * intercept our migration stream have the chance to see it.
2083      */
2084 
2085     /* We've got to be careful; if we don't read the data and just shut the fd
2086      * then the sender can error if we close while it's still sending.
2087      * We also mustn't read data that isn't there; some transports (RDMA)
2088      * will stall waiting for that data when the source has already closed.
2089      */
2090     if (ret == 0 && should_send_vmdesc()) {
2091         uint8_t *buf;
2092         uint32_t size;
2093         uint8_t  section_type = qemu_get_byte(f);
2094 
2095         if (section_type != QEMU_VM_VMDESCRIPTION) {
2096             error_report("Expected vmdescription section, but got %d",
2097                          section_type);
2098             /*
2099              * It doesn't seem worth failing at this point since
2100              * we apparently have an otherwise valid VM state
2101              */
2102         } else {
2103             buf = g_malloc(0x1000);
2104             size = qemu_get_be32(f);
2105 
2106             while (size > 0) {
2107                 uint32_t read_chunk = MIN(size, 0x1000);
2108                 qemu_get_buffer(f, buf, read_chunk);
2109                 size -= read_chunk;
2110             }
2111             g_free(buf);
2112         }
2113     }
2114 
2115     qemu_loadvm_state_cleanup();
2116     cpu_synchronize_all_post_init();
2117 
2118     return ret;
2119 }
2120 
2121 int save_snapshot(const char *name, Error **errp)
2122 {
2123     BlockDriverState *bs, *bs1;
2124     QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1;
2125     int ret = -1;
2126     QEMUFile *f;
2127     int saved_vm_running;
2128     uint64_t vm_state_size;
2129     qemu_timeval tv;
2130     struct tm tm;
2131     AioContext *aio_context;
2132 
2133     if (!bdrv_all_can_snapshot(&bs)) {
2134         error_setg(errp, "Device '%s' is writable but does not support "
2135                    "snapshots", bdrv_get_device_name(bs));
2136         return ret;
2137     }
2138 
2139     /* Delete old snapshots of the same name */
2140     if (name) {
2141         ret = bdrv_all_delete_snapshot(name, &bs1, errp);
2142         if (ret < 0) {
2143             error_prepend(errp, "Error while deleting snapshot on device "
2144                           "'%s': ", bdrv_get_device_name(bs1));
2145             return ret;
2146         }
2147     }
2148 
2149     bs = bdrv_all_find_vmstate_bs();
2150     if (bs == NULL) {
2151         error_setg(errp, "No block device can accept snapshots");
2152         return ret;
2153     }
2154     aio_context = bdrv_get_aio_context(bs);
2155 
2156     saved_vm_running = runstate_is_running();
2157 
2158     ret = global_state_store();
2159     if (ret) {
2160         error_setg(errp, "Error saving global state");
2161         return ret;
2162     }
2163     vm_stop(RUN_STATE_SAVE_VM);
2164 
2165     bdrv_drain_all_begin();
2166 
2167     aio_context_acquire(aio_context);
2168 
2169     memset(sn, 0, sizeof(*sn));
2170 
2171     /* fill auxiliary fields */
2172     qemu_gettimeofday(&tv);
2173     sn->date_sec = tv.tv_sec;
2174     sn->date_nsec = tv.tv_usec * 1000;
2175     sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2176 
2177     if (name) {
2178         ret = bdrv_snapshot_find(bs, old_sn, name);
2179         if (ret >= 0) {
2180             pstrcpy(sn->name, sizeof(sn->name), old_sn->name);
2181             pstrcpy(sn->id_str, sizeof(sn->id_str), old_sn->id_str);
2182         } else {
2183             pstrcpy(sn->name, sizeof(sn->name), name);
2184         }
2185     } else {
2186         /* cast below needed for OpenBSD where tv_sec is still 'long' */
2187         localtime_r((const time_t *)&tv.tv_sec, &tm);
2188         strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm);
2189     }
2190 
2191     /* save the VM state */
2192     f = qemu_fopen_bdrv(bs, 1);
2193     if (!f) {
2194         error_setg(errp, "Could not open VM state file");
2195         goto the_end;
2196     }
2197     ret = qemu_savevm_state(f, errp);
2198     vm_state_size = qemu_ftell(f);
2199     qemu_fclose(f);
2200     if (ret < 0) {
2201         goto the_end;
2202     }
2203 
2204     /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2205      * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2206      * it only releases the lock once.  Therefore synchronous I/O will deadlock
2207      * unless we release the AioContext before bdrv_all_create_snapshot().
2208      */
2209     aio_context_release(aio_context);
2210     aio_context = NULL;
2211 
2212     ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
2213     if (ret < 0) {
2214         error_setg(errp, "Error while creating snapshot on '%s'",
2215                    bdrv_get_device_name(bs));
2216         goto the_end;
2217     }
2218 
2219     ret = 0;
2220 
2221  the_end:
2222     if (aio_context) {
2223         aio_context_release(aio_context);
2224     }
2225 
2226     bdrv_drain_all_end();
2227 
2228     if (saved_vm_running) {
2229         vm_start();
2230     }
2231     return ret;
2232 }
2233 
2234 void qmp_xen_save_devices_state(const char *filename, Error **errp)
2235 {
2236     QEMUFile *f;
2237     QIOChannelFile *ioc;
2238     int saved_vm_running;
2239     int ret;
2240 
2241     saved_vm_running = runstate_is_running();
2242     vm_stop(RUN_STATE_SAVE_VM);
2243     global_state_store_running();
2244 
2245     ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp);
2246     if (!ioc) {
2247         goto the_end;
2248     }
2249     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2250     f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2251     ret = qemu_save_device_state(f);
2252     qemu_fclose(f);
2253     if (ret < 0) {
2254         error_setg(errp, QERR_IO_ERROR);
2255     }
2256 
2257  the_end:
2258     if (saved_vm_running) {
2259         vm_start();
2260     }
2261 }
2262 
2263 void qmp_xen_load_devices_state(const char *filename, Error **errp)
2264 {
2265     QEMUFile *f;
2266     QIOChannelFile *ioc;
2267     int ret;
2268 
2269     /* Guest must be paused before loading the device state; the RAM state
2270      * will already have been loaded by xc
2271      */
2272     if (runstate_is_running()) {
2273         error_setg(errp, "Cannot update device state while vm is running");
2274         return;
2275     }
2276     vm_stop(RUN_STATE_RESTORE_VM);
2277 
2278     ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2279     if (!ioc) {
2280         return;
2281     }
2282     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2283     f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2284 
2285     ret = qemu_loadvm_state(f);
2286     qemu_fclose(f);
2287     if (ret < 0) {
2288         error_setg(errp, QERR_IO_ERROR);
2289     }
2290     migration_incoming_state_destroy();
2291 }
2292 
2293 int load_snapshot(const char *name, Error **errp)
2294 {
2295     BlockDriverState *bs, *bs_vm_state;
2296     QEMUSnapshotInfo sn;
2297     QEMUFile *f;
2298     int ret;
2299     AioContext *aio_context;
2300     MigrationIncomingState *mis = migration_incoming_get_current();
2301 
2302     if (!bdrv_all_can_snapshot(&bs)) {
2303         error_setg(errp,
2304                    "Device '%s' is writable but does not support snapshots",
2305                    bdrv_get_device_name(bs));
2306         return -ENOTSUP;
2307     }
2308     ret = bdrv_all_find_snapshot(name, &bs);
2309     if (ret < 0) {
2310         error_setg(errp,
2311                    "Device '%s' does not have the requested snapshot '%s'",
2312                    bdrv_get_device_name(bs), name);
2313         return ret;
2314     }
2315 
2316     bs_vm_state = bdrv_all_find_vmstate_bs();
2317     if (!bs_vm_state) {
2318         error_setg(errp, "No block device supports snapshots");
2319         return -ENOTSUP;
2320     }
2321     aio_context = bdrv_get_aio_context(bs_vm_state);
2322 
2323     /* Don't even try to load empty VM states */
2324     aio_context_acquire(aio_context);
2325     ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
2326     aio_context_release(aio_context);
2327     if (ret < 0) {
2328         return ret;
2329     } else if (sn.vm_state_size == 0) {
2330         error_setg(errp, "This is a disk-only snapshot. Revert to it "
2331                    " offline using qemu-img");
2332         return -EINVAL;
2333     }
2334 
2335     /* Flush all IO requests so they don't interfere with the new state.  */
2336     bdrv_drain_all_begin();
2337 
2338     ret = bdrv_all_goto_snapshot(name, &bs);
2339     if (ret < 0) {
2340         error_setg(errp, "Error %d while activating snapshot '%s' on '%s'",
2341                      ret, name, bdrv_get_device_name(bs));
2342         goto err_drain;
2343     }
2344 
2345     /* restore the VM state */
2346     f = qemu_fopen_bdrv(bs_vm_state, 0);
2347     if (!f) {
2348         error_setg(errp, "Could not open VM state file");
2349         ret = -EINVAL;
2350         goto err_drain;
2351     }
2352 
2353     qemu_system_reset(SHUTDOWN_CAUSE_NONE);
2354     mis->from_src_file = f;
2355 
2356     aio_context_acquire(aio_context);
2357     ret = qemu_loadvm_state(f);
2358     migration_incoming_state_destroy();
2359     aio_context_release(aio_context);
2360 
2361     bdrv_drain_all_end();
2362 
2363     if (ret < 0) {
2364         error_setg(errp, "Error %d while loading VM state", ret);
2365         return ret;
2366     }
2367 
2368     return 0;
2369 
2370 err_drain:
2371     bdrv_drain_all_end();
2372     return ret;
2373 }
2374 
2375 void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
2376 {
2377     qemu_ram_set_idstr(mr->ram_block,
2378                        memory_region_name(mr), dev);
2379 }
2380 
2381 void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
2382 {
2383     qemu_ram_unset_idstr(mr->ram_block);
2384 }
2385 
2386 void vmstate_register_ram_global(MemoryRegion *mr)
2387 {
2388     vmstate_register_ram(mr, NULL);
2389 }
2390 
2391 bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
2392 {
2393     /* check needed if --only-migratable is specified */
2394     if (!migrate_get_current()->only_migratable) {
2395         return true;
2396     }
2397 
2398     return !(vmsd && vmsd->unmigratable);
2399 }
2400