xref: /openbmc/qemu/migration/savevm.c (revision dc5bd18f)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2009-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "hw/boards.h"
31 #include "hw/xen/xen.h"
32 #include "net/net.h"
33 #include "migration.h"
34 #include "migration/snapshot.h"
35 #include "migration/misc.h"
36 #include "migration/register.h"
37 #include "migration/global_state.h"
38 #include "ram.h"
39 #include "qemu-file-channel.h"
40 #include "qemu-file.h"
41 #include "savevm.h"
42 #include "postcopy-ram.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-commands-migration.h"
45 #include "qapi/qapi-commands-misc.h"
46 #include "qapi/qmp/qerror.h"
47 #include "qemu/error-report.h"
48 #include "sysemu/cpus.h"
49 #include "exec/memory.h"
50 #include "exec/target_page.h"
51 #include "trace.h"
52 #include "qemu/iov.h"
53 #include "block/snapshot.h"
54 #include "qemu/cutils.h"
55 #include "io/channel-buffer.h"
56 #include "io/channel-file.h"
57 
58 #ifndef ETH_P_RARP
59 #define ETH_P_RARP 0x8035
60 #endif
61 #define ARP_HTYPE_ETH 0x0001
62 #define ARP_PTYPE_IP 0x0800
63 #define ARP_OP_REQUEST_REV 0x3
64 
65 const unsigned int postcopy_ram_discard_version = 0;
66 
/*
 * Subcommands for QEMU_VM_COMMAND.
 *
 * The enumerator value is written to the stream as a 16-bit command id by
 * qemu_savevm_command_send() and is also used as the index into the
 * mig_cmd_args[] table.
 */
enum qemu_vm_cmd {
    MIG_CMD_INVALID = 0,   /* Must be 0 */
    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
    MIG_CMD_PING,              /* Request a PONG on the RP */

    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
                                      warn we might want to do PC */
    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
                                      pages as it's running. */
    MIG_CMD_POSTCOPY_RUN,          /* Start execution */

    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
                                      were previously sent during
                                      precopy but are dirty. */
    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
    MIG_CMD_MAX                /* Sentinel: one past the last real command */
};
85 
/* Largest payload qemu_savevm_send_packaged() is willing to send */
#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
/*
 * Per-command metadata, indexed by enum qemu_vm_cmd:
 *   len  - size in bytes of the command's argument data,
 *          or -1 when the size is variable
 *   name - human-readable command name
 */
static struct mig_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name;
} mig_cmd_args[] = {
    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
    /* 4 bytes: the be32 length of the packaged stream that follows */
    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
};
102 
103 /* Note for MIG_CMD_POSTCOPY_ADVISE:
104  * The format of arguments is depending on postcopy mode:
105  * - postcopy RAM only
106  *   uint64_t host page size
 *   uint64_t target page size
108  *
109  * - postcopy RAM and postcopy dirty bitmaps
110  *   format is the same as for postcopy RAM only
111  *
112  * - postcopy dirty bitmaps only
113  *   Nothing. Command length field is 0.
114  *
115  * Be careful: adding a new postcopy entity with some other parameters should
116  * not break format self-description ability. Good way is to introduce some
117  * generic extendable format with an exception for two old entities.
118  */
119 
120 static int announce_self_create(uint8_t *buf,
121                                 uint8_t *mac_addr)
122 {
123     /* Ethernet header. */
124     memset(buf, 0xff, 6);         /* destination MAC addr */
125     memcpy(buf + 6, mac_addr, 6); /* source MAC addr */
126     *(uint16_t *)(buf + 12) = htons(ETH_P_RARP); /* ethertype */
127 
128     /* RARP header. */
129     *(uint16_t *)(buf + 14) = htons(ARP_HTYPE_ETH); /* hardware addr space */
130     *(uint16_t *)(buf + 16) = htons(ARP_PTYPE_IP); /* protocol addr space */
131     *(buf + 18) = 6; /* hardware addr length (ethernet) */
132     *(buf + 19) = 4; /* protocol addr length (IPv4) */
133     *(uint16_t *)(buf + 20) = htons(ARP_OP_REQUEST_REV); /* opcode */
134     memcpy(buf + 22, mac_addr, 6); /* source hw addr */
135     memset(buf + 28, 0x00, 4);     /* source protocol addr */
136     memcpy(buf + 32, mac_addr, 6); /* target hw addr */
137     memset(buf + 38, 0x00, 4);     /* target protocol addr */
138 
139     /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */
140     memset(buf + 42, 0x00, 18);
141 
142     return 60; /* len (FCS will be added by hardware) */
143 }
144 
145 static void qemu_announce_self_iter(NICState *nic, void *opaque)
146 {
147     uint8_t buf[60];
148     int len;
149 
150     trace_qemu_announce_self_iter(qemu_ether_ntoa(&nic->conf->macaddr));
151     len = announce_self_create(buf, nic->conf->macaddr.a);
152 
153     qemu_send_packet_raw(qemu_get_queue(nic), buf, len);
154 }
155 
156 
157 static void qemu_announce_self_once(void *opaque)
158 {
159     static int count = SELF_ANNOUNCE_ROUNDS;
160     QEMUTimer *timer = *(QEMUTimer **)opaque;
161 
162     qemu_foreach_nic(qemu_announce_self_iter, NULL);
163 
164     if (--count) {
165         /* delay 50ms, 150ms, 250ms, ... */
166         timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
167                   self_announce_delay(count));
168     } else {
169             timer_del(timer);
170             timer_free(timer);
171     }
172 }
173 
174 void qemu_announce_self(void)
175 {
176     static QEMUTimer *timer;
177     timer = timer_new_ms(QEMU_CLOCK_REALTIME, qemu_announce_self_once, &timer);
178     qemu_announce_self_once(&timer);
179 }
180 
181 /***********************************************************/
182 /* savevm/loadvm support */
183 
184 static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
185                                    int64_t pos)
186 {
187     int ret;
188     QEMUIOVector qiov;
189 
190     qemu_iovec_init_external(&qiov, iov, iovcnt);
191     ret = bdrv_writev_vmstate(opaque, &qiov, pos);
192     if (ret < 0) {
193         return ret;
194     }
195 
196     return qiov.size;
197 }
198 
199 static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
200                                 size_t size)
201 {
202     return bdrv_load_vmstate(opaque, buf, pos, size);
203 }
204 
205 static int bdrv_fclose(void *opaque)
206 {
207     return bdrv_flush(opaque);
208 }
209 
/* QEMUFile operations used when a block device is opened for reading */
static const QEMUFileOps bdrv_read_ops = {
    .get_buffer = block_get_buffer,
    .close =      bdrv_fclose
};
214 
/* QEMUFile operations used when a block device is opened for writing */
static const QEMUFileOps bdrv_write_ops = {
    .writev_buffer  = block_writev_buffer,
    .close          = bdrv_fclose
};
219 
220 static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
221 {
222     if (is_writable) {
223         return qemu_fopen_ops(bs, &bdrv_write_ops);
224     }
225     return qemu_fopen_ops(bs, &bdrv_read_ops);
226 }
227 
228 
229 /* QEMUFile timer support.
230  * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
231  */
232 
233 void timer_put(QEMUFile *f, QEMUTimer *ts)
234 {
235     uint64_t expire_time;
236 
237     expire_time = timer_expire_time_ns(ts);
238     qemu_put_be64(f, expire_time);
239 }
240 
241 void timer_get(QEMUFile *f, QEMUTimer *ts)
242 {
243     uint64_t expire_time;
244 
245     expire_time = qemu_get_be64(f);
246     if (expire_time != -1) {
247         timer_mod_ns(ts, expire_time);
248     } else {
249         timer_del(ts);
250     }
251 }
252 
253 
254 /* VMState timer support.
255  * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
256  */
257 
258 static int get_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field)
259 {
260     QEMUTimer *v = pv;
261     timer_get(f, v);
262     return 0;
263 }
264 
265 static int put_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field,
266                      QJSON *vmdesc)
267 {
268     QEMUTimer *v = pv;
269     timer_put(f, v);
270 
271     return 0;
272 }
273 
/* VMState type descriptor that saves/loads a QEMUTimer's expiry time */
const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};
279 
280 
/*
 * Legacy identification of a SaveStateEntry: the registration name without
 * the device path prefix, plus the instance id that goes with that name.
 */
typedef struct CompatEntry {
    char idstr[256];
    int instance_id;
} CompatEntry;
285 
/* One registered piece of migratable state (old-style handlers or a vmsd) */
typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    /* "<device path>/<name>" when a device path is available, else "<name>" */
    char idstr[256];
    int instance_id;
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    /* taken from savevm_state.global_section_id at registration time */
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    /* old-style handlers; not set for vmsd registrations */
    SaveVMHandlers *ops;
    /* VMState description; NULL for old-style registrations */
    const VMStateDescription *vmsd;
    void *opaque;
    /* allocated only when idstr was prefixed with a device path */
    CompatEntry *compat;
    /* set to 1 when ops provides a save_setup hook */
    int is_ram;
} SaveStateEntry;
303 
/* Global savevm bookkeeping; also the object behind the configuration vmsd */
typedef struct SaveState {
    /* registered entries, kept ordered by savevm_state_handler_insert() */
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    /* next section id to hand out on registration */
    int global_section_id;
    /* length of @name (the name is sent without a NUL terminator) */
    uint32_t len;
    /* machine type name carried in the configuration section */
    const char *name;
    uint32_t target_page_bits;
} SaveState;
311 
/* The single SaveState instance: starts with no handlers, section ids at 0 */
static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .global_section_id = 0,
};
316 
317 static int configuration_pre_save(void *opaque)
318 {
319     SaveState *state = opaque;
320     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
321 
322     state->len = strlen(current_name);
323     state->name = current_name;
324     state->target_page_bits = qemu_target_page_bits();
325 
326     return 0;
327 }
328 
329 static int configuration_pre_load(void *opaque)
330 {
331     SaveState *state = opaque;
332 
333     /* If there is no target-page-bits subsection it means the source
334      * predates the variable-target-page-bits support and is using the
335      * minimum possible value for this CPU.
336      */
337     state->target_page_bits = qemu_target_page_bits_min();
338     return 0;
339 }
340 
341 static int configuration_post_load(void *opaque, int version_id)
342 {
343     SaveState *state = opaque;
344     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
345 
346     if (strncmp(state->name, current_name, state->len) != 0) {
347         error_report("Machine type received is '%.*s' and local is '%s'",
348                      (int) state->len, state->name, current_name);
349         return -EINVAL;
350     }
351 
352     if (state->target_page_bits != qemu_target_page_bits()) {
353         error_report("Received TARGET_PAGE_BITS is %d but local is %d",
354                      state->target_page_bits, qemu_target_page_bits());
355         return -EINVAL;
356     }
357 
358     return 0;
359 }
360 
/*
 * .needed hook of the target-page-bits subsection.
 *
 * The subsection is sent only when the target page size in use differs
 * from the default, i.e. is larger than the minimum page size of a
 * variable-page-size guest CPU. When present it carries the actual target
 * page bits of the machine, and migration fails if the two ends disagree.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    return qemu_target_page_bits_min() < qemu_target_page_bits();
}
373 
/*
 * Optional subsection of "configuration" carrying SaveState.target_page_bits;
 * emitted only when vmstate_target_page_bits_needed() returns true.
 */
static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};
384 
/*
 * Description of the configuration section: the machine type name as a
 * length-prefixed buffer, plus the optional target-page-bits subsection.
 */
static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),
        /* buffer of 'len' bytes, allocated on load */
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_target_page_bits,
        NULL
    }
};
401 
402 static void dump_vmstate_vmsd(FILE *out_file,
403                               const VMStateDescription *vmsd, int indent,
404                               bool is_subsection);
405 
406 static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
407                               int indent)
408 {
409     fprintf(out_file, "%*s{\n", indent, "");
410     indent += 2;
411     fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
412     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
413             field->version_id);
414     fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
415             field->field_exists ? "true" : "false");
416     fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
417     if (field->vmsd != NULL) {
418         fprintf(out_file, ",\n");
419         dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
420     }
421     fprintf(out_file, "\n%*s}", indent - 2, "");
422 }
423 
424 static void dump_vmstate_vmss(FILE *out_file,
425                               const VMStateDescription **subsection,
426                               int indent)
427 {
428     if (*subsection != NULL) {
429         dump_vmstate_vmsd(out_file, *subsection, indent, true);
430     }
431 }
432 
433 static void dump_vmstate_vmsd(FILE *out_file,
434                               const VMStateDescription *vmsd, int indent,
435                               bool is_subsection)
436 {
437     if (is_subsection) {
438         fprintf(out_file, "%*s{\n", indent, "");
439     } else {
440         fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
441     }
442     indent += 2;
443     fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
444     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
445             vmsd->version_id);
446     fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
447             vmsd->minimum_version_id);
448     if (vmsd->fields != NULL) {
449         const VMStateField *field = vmsd->fields;
450         bool first;
451 
452         fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
453         first = true;
454         while (field->name != NULL) {
455             if (field->flags & VMS_MUST_EXIST) {
456                 /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
457                 field++;
458                 continue;
459             }
460             if (!first) {
461                 fprintf(out_file, ",\n");
462             }
463             dump_vmstate_vmsf(out_file, field, indent + 2);
464             field++;
465             first = false;
466         }
467         fprintf(out_file, "\n%*s]", indent, "");
468     }
469     if (vmsd->subsections != NULL) {
470         const VMStateDescription **subsection = vmsd->subsections;
471         bool first;
472 
473         fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
474         first = true;
475         while (*subsection != NULL) {
476             if (!first) {
477                 fprintf(out_file, ",\n");
478             }
479             dump_vmstate_vmss(out_file, subsection, indent + 2);
480             subsection++;
481             first = false;
482         }
483         fprintf(out_file, "\n%*s]", indent, "");
484     }
485     fprintf(out_file, "\n%*s}", indent - 2, "");
486 }
487 
488 static void dump_machine_type(FILE *out_file)
489 {
490     MachineClass *mc;
491 
492     mc = MACHINE_GET_CLASS(current_machine);
493 
494     fprintf(out_file, "  \"vmschkmachine\": {\n");
495     fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
496     fprintf(out_file, "  },\n");
497 }
498 
499 void dump_vmstate_json_to_file(FILE *out_file)
500 {
501     GSList *list, *elt;
502     bool first;
503 
504     fprintf(out_file, "{\n");
505     dump_machine_type(out_file);
506 
507     first = true;
508     list = object_class_get_list(TYPE_DEVICE, true);
509     for (elt = list; elt; elt = elt->next) {
510         DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
511                                              TYPE_DEVICE);
512         const char *name;
513         int indent = 2;
514 
515         if (!dc->vmsd) {
516             continue;
517         }
518 
519         if (!first) {
520             fprintf(out_file, ",\n");
521         }
522         name = object_class_get_name(OBJECT_CLASS(dc));
523         fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
524         indent += 2;
525         fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
526         fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
527                 dc->vmsd->version_id);
528         fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
529                 dc->vmsd->minimum_version_id);
530 
531         dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
532 
533         fprintf(out_file, "\n%*s}", indent - 2, "");
534         first = false;
535     }
536     fprintf(out_file, "\n}\n");
537     fclose(out_file);
538 }
539 
540 static int calculate_new_instance_id(const char *idstr)
541 {
542     SaveStateEntry *se;
543     int instance_id = 0;
544 
545     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
546         if (strcmp(idstr, se->idstr) == 0
547             && instance_id <= se->instance_id) {
548             instance_id = se->instance_id + 1;
549         }
550     }
551     return instance_id;
552 }
553 
554 static int calculate_compat_instance_id(const char *idstr)
555 {
556     SaveStateEntry *se;
557     int instance_id = 0;
558 
559     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
560         if (!se->compat) {
561             continue;
562         }
563 
564         if (strcmp(idstr, se->compat->idstr) == 0
565             && instance_id <= se->compat->instance_id) {
566             instance_id = se->compat->instance_id + 1;
567         }
568     }
569     return instance_id;
570 }
571 
572 static inline MigrationPriority save_state_priority(SaveStateEntry *se)
573 {
574     if (se->vmsd) {
575         return se->vmsd->priority;
576     }
577     return MIG_PRI_DEFAULT;
578 }
579 
580 static void savevm_state_handler_insert(SaveStateEntry *nse)
581 {
582     MigrationPriority priority = save_state_priority(nse);
583     SaveStateEntry *se;
584 
585     assert(priority <= MIG_PRI_MAX);
586 
587     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
588         if (save_state_priority(se) < priority) {
589             break;
590         }
591     }
592 
593     if (se) {
594         QTAILQ_INSERT_BEFORE(se, nse, entry);
595     } else {
596         QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
597     }
598 }
599 
600 /* TODO: Individual devices generally have very little idea about the rest
601    of the system, so instance_id should be removed/replaced.
602    Meanwhile pass -1 as instance_id if you do not already have a clearly
603    distinguishing id for all instances of your device class. */
604 int register_savevm_live(DeviceState *dev,
605                          const char *idstr,
606                          int instance_id,
607                          int version_id,
608                          SaveVMHandlers *ops,
609                          void *opaque)
610 {
611     SaveStateEntry *se;
612 
613     se = g_new0(SaveStateEntry, 1);
614     se->version_id = version_id;
615     se->section_id = savevm_state.global_section_id++;
616     se->ops = ops;
617     se->opaque = opaque;
618     se->vmsd = NULL;
619     /* if this is a live_savem then set is_ram */
620     if (ops->save_setup != NULL) {
621         se->is_ram = 1;
622     }
623 
624     if (dev) {
625         char *id = qdev_get_dev_path(dev);
626         if (id) {
627             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
628                 sizeof(se->idstr)) {
629                 error_report("Path too long for VMState (%s)", id);
630                 g_free(id);
631                 g_free(se);
632 
633                 return -1;
634             }
635             g_free(id);
636 
637             se->compat = g_new0(CompatEntry, 1);
638             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr);
639             se->compat->instance_id = instance_id == -1 ?
640                          calculate_compat_instance_id(idstr) : instance_id;
641             instance_id = -1;
642         }
643     }
644     pstrcat(se->idstr, sizeof(se->idstr), idstr);
645 
646     if (instance_id == -1) {
647         se->instance_id = calculate_new_instance_id(se->idstr);
648     } else {
649         se->instance_id = instance_id;
650     }
651     assert(!se->compat || se->instance_id == 0);
652     savevm_state_handler_insert(se);
653     return 0;
654 }
655 
656 void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque)
657 {
658     SaveStateEntry *se, *new_se;
659     char id[256] = "";
660 
661     if (dev) {
662         char *path = qdev_get_dev_path(dev);
663         if (path) {
664             pstrcpy(id, sizeof(id), path);
665             pstrcat(id, sizeof(id), "/");
666             g_free(path);
667         }
668     }
669     pstrcat(id, sizeof(id), idstr);
670 
671     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
672         if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
673             QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
674             g_free(se->compat);
675             g_free(se);
676         }
677     }
678 }
679 
680 int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
681                                    const VMStateDescription *vmsd,
682                                    void *opaque, int alias_id,
683                                    int required_for_version,
684                                    Error **errp)
685 {
686     SaveStateEntry *se;
687 
688     /* If this triggers, alias support can be dropped for the vmsd. */
689     assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
690 
691     se = g_new0(SaveStateEntry, 1);
692     se->version_id = vmsd->version_id;
693     se->section_id = savevm_state.global_section_id++;
694     se->opaque = opaque;
695     se->vmsd = vmsd;
696     se->alias_id = alias_id;
697 
698     if (dev) {
699         char *id = qdev_get_dev_path(dev);
700         if (id) {
701             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
702                 sizeof(se->idstr)) {
703                 error_setg(errp, "Path too long for VMState (%s)", id);
704                 g_free(id);
705                 g_free(se);
706 
707                 return -1;
708             }
709             g_free(id);
710 
711             se->compat = g_new0(CompatEntry, 1);
712             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
713             se->compat->instance_id = instance_id == -1 ?
714                          calculate_compat_instance_id(vmsd->name) : instance_id;
715             instance_id = -1;
716         }
717     }
718     pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
719 
720     if (instance_id == -1) {
721         se->instance_id = calculate_new_instance_id(se->idstr);
722     } else {
723         se->instance_id = instance_id;
724     }
725     assert(!se->compat || se->instance_id == 0);
726     savevm_state_handler_insert(se);
727     return 0;
728 }
729 
730 void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd,
731                         void *opaque)
732 {
733     SaveStateEntry *se, *new_se;
734 
735     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
736         if (se->vmsd == vmsd && se->opaque == opaque) {
737             QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
738             g_free(se->compat);
739             g_free(se);
740         }
741     }
742 }
743 
744 static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
745 {
746     trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
747     if (!se->vmsd) {         /* Old style */
748         return se->ops->load_state(f, se->opaque, se->load_version_id);
749     }
750     return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
751 }
752 
753 static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
754 {
755     int64_t old_offset, size;
756 
757     old_offset = qemu_ftell_fast(f);
758     se->ops->save_state(f, se->opaque);
759     size = qemu_ftell_fast(f) - old_offset;
760 
761     if (vmdesc) {
762         json_prop_int(vmdesc, "size", size);
763         json_start_array(vmdesc, "fields");
764         json_start_object(vmdesc, NULL);
765         json_prop_str(vmdesc, "name", "data");
766         json_prop_int(vmdesc, "size", size);
767         json_prop_str(vmdesc, "type", "buffer");
768         json_end_object(vmdesc);
769         json_end_array(vmdesc);
770     }
771 }
772 
773 static int vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
774 {
775     trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
776     if (!se->vmsd) {
777         vmstate_save_old_style(f, se, vmdesc);
778         return 0;
779     }
780     return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
781 }
782 
783 /*
784  * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
785  */
786 static void save_section_header(QEMUFile *f, SaveStateEntry *se,
787                                 uint8_t section_type)
788 {
789     qemu_put_byte(f, section_type);
790     qemu_put_be32(f, se->section_id);
791 
792     if (section_type == QEMU_VM_SECTION_FULL ||
793         section_type == QEMU_VM_SECTION_START) {
794         /* ID string */
795         size_t len = strlen(se->idstr);
796         qemu_put_byte(f, len);
797         qemu_put_buffer(f, (uint8_t *)se->idstr, len);
798 
799         qemu_put_be32(f, se->instance_id);
800         qemu_put_be32(f, se->version_id);
801     }
802 }
803 
804 /*
805  * Write a footer onto device sections that catches cases misformatted device
806  * sections.
807  */
808 static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
809 {
810     if (migrate_get_current()->send_section_footer) {
811         qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
812         qemu_put_be32(f, se->section_id);
813     }
814 }
815 
816 /**
817  * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
818  *                           command and associated data.
819  *
820  * @f: File to send command on
821  * @command: Command type to send
822  * @len: Length of associated data
823  * @data: Data associated with command.
824  */
825 static void qemu_savevm_command_send(QEMUFile *f,
826                                      enum qemu_vm_cmd command,
827                                      uint16_t len,
828                                      uint8_t *data)
829 {
830     trace_savevm_command_send(command, len);
831     qemu_put_byte(f, QEMU_VM_COMMAND);
832     qemu_put_be16(f, (uint16_t)command);
833     qemu_put_be16(f, len);
834     qemu_put_buffer(f, data, len);
835     qemu_fflush(f);
836 }
837 
838 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
839 {
840     uint32_t buf;
841 
842     trace_savevm_send_ping(value);
843     buf = cpu_to_be32(value);
844     qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
845 }
846 
847 void qemu_savevm_send_open_return_path(QEMUFile *f)
848 {
849     trace_savevm_send_open_return_path();
850     qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
851 }
852 
853 /* We have a buffer of data to send; we don't want that all to be loaded
854  * by the command itself, so the command contains just the length of the
855  * extra buffer that we then send straight after it.
856  * TODO: Must be a better way to organise that
857  *
858  * Returns:
859  *    0 on success
860  *    -ve on error
861  */
862 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
863 {
864     uint32_t tmp;
865 
866     if (len > MAX_VM_CMD_PACKAGED_SIZE) {
867         error_report("%s: Unreasonably large packaged state: %zu",
868                      __func__, len);
869         return -1;
870     }
871 
872     tmp = cpu_to_be32(len);
873 
874     trace_qemu_savevm_send_packaged();
875     qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
876 
877     qemu_put_buffer(f, buf, len);
878 
879     return 0;
880 }
881 
882 /* Send prior to any postcopy transfer */
883 void qemu_savevm_send_postcopy_advise(QEMUFile *f)
884 {
885     if (migrate_postcopy_ram()) {
886         uint64_t tmp[2];
887         tmp[0] = cpu_to_be64(ram_pagesize_summary());
888         tmp[1] = cpu_to_be64(qemu_target_page_size());
889 
890         trace_qemu_savevm_send_postcopy_advise();
891         qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
892                                  16, (uint8_t *)tmp);
893     } else {
894         qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
895     }
896 }
897 
898 /* Sent prior to starting the destination running in postcopy, discard pages
899  * that have already been sent but redirtied on the source.
900  * CMD_POSTCOPY_RAM_DISCARD consist of:
901  *      byte   version (0)
902  *      byte   Length of name field (not including 0)
903  *  n x byte   RAM block name
904  *      byte   0 terminator (just for safety)
905  *  n x        Byte ranges within the named RAMBlock
906  *      be64   Start of the range
907  *      be64   Length
908  *
909  *  name:  RAMBlock name that these entries are part of
910  *  len: Number of page entries
911  *  start_list: 'len' addresses
912  *  length_list: 'len' addresses
913  *
914  */
915 void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
916                                            uint16_t len,
917                                            uint64_t *start_list,
918                                            uint64_t *length_list)
919 {
920     uint8_t *buf;
921     uint16_t tmplen;
922     uint16_t t;
923     size_t name_len = strlen(name);
924 
925     trace_qemu_savevm_send_postcopy_ram_discard(name, len);
926     assert(name_len < 256);
927     buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
928     buf[0] = postcopy_ram_discard_version;
929     buf[1] = name_len;
930     memcpy(buf + 2, name, name_len);
931     tmplen = 2 + name_len;
932     buf[tmplen++] = '\0';
933 
934     for (t = 0; t < len; t++) {
935         stq_be_p(buf + tmplen, start_list[t]);
936         tmplen += 8;
937         stq_be_p(buf + tmplen, length_list[t]);
938         tmplen += 8;
939     }
940     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
941     g_free(buf);
942 }
943 
/*
 * Get the destination into a state where it can receive postcopy data.
 *
 * Emits MIG_CMD_POSTCOPY_LISTEN on @f with a zero length and no data
 * buffer, after firing the matching trace point.
 */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}
950 
/*
 * Kick the destination into running.
 *
 * Emits MIG_CMD_POSTCOPY_RUN on @f with a zero length and no data buffer,
 * after firing the matching trace point.
 */
void qemu_savevm_send_postcopy_run(QEMUFile *f)
{
    trace_savevm_send_postcopy_run();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
}
957 
958 bool qemu_savevm_state_blocked(Error **errp)
959 {
960     SaveStateEntry *se;
961 
962     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
963         if (se->vmsd && se->vmsd->unmigratable) {
964             error_setg(errp, "State blocked by non-migratable device '%s'",
965                        se->idstr);
966             return true;
967         }
968     }
969     return false;
970 }
971 
972 void qemu_savevm_state_header(QEMUFile *f)
973 {
974     trace_savevm_state_header();
975     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
976     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
977 
978     if (migrate_get_current()->send_configuration) {
979         qemu_put_byte(f, QEMU_VM_CONFIGURATION);
980         vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
981     }
982 }
983 
984 void qemu_savevm_state_setup(QEMUFile *f)
985 {
986     SaveStateEntry *se;
987     int ret;
988 
989     trace_savevm_state_setup();
990     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
991         if (!se->ops || !se->ops->save_setup) {
992             continue;
993         }
994         if (se->ops && se->ops->is_active) {
995             if (!se->ops->is_active(se->opaque)) {
996                 continue;
997             }
998         }
999         save_section_header(f, se, QEMU_VM_SECTION_START);
1000 
1001         ret = se->ops->save_setup(f, se->opaque);
1002         save_section_footer(f, se);
1003         if (ret < 0) {
1004             qemu_file_set_error(f, ret);
1005             break;
1006         }
1007     }
1008 }
1009 
1010 /*
1011  * this function has three return values:
1012  *   negative: there was one error, and we have -errno.
1013  *   0 : We haven't finished, caller have to go again
1014  *   1 : We have finished, we can go to complete phase
1015  */
1016 int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1017 {
1018     SaveStateEntry *se;
1019     int ret = 1;
1020 
1021     trace_savevm_state_iterate();
1022     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1023         if (!se->ops || !se->ops->save_live_iterate) {
1024             continue;
1025         }
1026         if (se->ops && se->ops->is_active) {
1027             if (!se->ops->is_active(se->opaque)) {
1028                 continue;
1029             }
1030         }
1031         /*
1032          * In the postcopy phase, any device that doesn't know how to
1033          * do postcopy should have saved it's state in the _complete
1034          * call that's already run, it might get confused if we call
1035          * iterate afterwards.
1036          */
1037         if (postcopy &&
1038             !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1039             continue;
1040         }
1041         if (qemu_file_rate_limit(f)) {
1042             return 0;
1043         }
1044         trace_savevm_section_start(se->idstr, se->section_id);
1045 
1046         save_section_header(f, se, QEMU_VM_SECTION_PART);
1047 
1048         ret = se->ops->save_live_iterate(f, se->opaque);
1049         trace_savevm_section_end(se->idstr, se->section_id, ret);
1050         save_section_footer(f, se);
1051 
1052         if (ret < 0) {
1053             qemu_file_set_error(f, ret);
1054         }
1055         if (ret <= 0) {
1056             /* Do not proceed to the next vmstate before this one reported
1057                completion of the current stage. This serializes the migration
1058                and reduces the probability that a faster changing state is
1059                synchronized over and over again. */
1060             break;
1061         }
1062     }
1063     return ret;
1064 }
1065 
1066 static bool should_send_vmdesc(void)
1067 {
1068     MachineState *machine = MACHINE(qdev_get_machine());
1069     bool in_postcopy = migration_in_postcopy();
1070     return !machine->suppress_vmdesc && !in_postcopy;
1071 }
1072 
1073 /*
1074  * Calls the save_live_complete_postcopy methods
1075  * causing the last few pages to be sent immediately and doing any associated
1076  * cleanup.
1077  * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1078  * all the other devices, but that happens at the point we switch to postcopy.
1079  */
1080 void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1081 {
1082     SaveStateEntry *se;
1083     int ret;
1084 
1085     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1086         if (!se->ops || !se->ops->save_live_complete_postcopy) {
1087             continue;
1088         }
1089         if (se->ops && se->ops->is_active) {
1090             if (!se->ops->is_active(se->opaque)) {
1091                 continue;
1092             }
1093         }
1094         trace_savevm_section_start(se->idstr, se->section_id);
1095         /* Section type */
1096         qemu_put_byte(f, QEMU_VM_SECTION_END);
1097         qemu_put_be32(f, se->section_id);
1098 
1099         ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1100         trace_savevm_section_end(se->idstr, se->section_id, ret);
1101         save_section_footer(f, se);
1102         if (ret < 0) {
1103             qemu_file_set_error(f, ret);
1104             return;
1105         }
1106     }
1107 
1108     qemu_put_byte(f, QEMU_VM_EOF);
1109     qemu_fflush(f);
1110 }
1111 
1112 int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1113                                        bool inactivate_disks)
1114 {
1115     QJSON *vmdesc;
1116     int vmdesc_len;
1117     SaveStateEntry *se;
1118     int ret;
1119     bool in_postcopy = migration_in_postcopy();
1120 
1121     trace_savevm_state_complete_precopy();
1122 
1123     cpu_synchronize_all_states();
1124 
1125     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1126         if (!se->ops ||
1127             (in_postcopy && se->ops->has_postcopy &&
1128              se->ops->has_postcopy(se->opaque)) ||
1129             (in_postcopy && !iterable_only) ||
1130             !se->ops->save_live_complete_precopy) {
1131             continue;
1132         }
1133 
1134         if (se->ops && se->ops->is_active) {
1135             if (!se->ops->is_active(se->opaque)) {
1136                 continue;
1137             }
1138         }
1139         trace_savevm_section_start(se->idstr, se->section_id);
1140 
1141         save_section_header(f, se, QEMU_VM_SECTION_END);
1142 
1143         ret = se->ops->save_live_complete_precopy(f, se->opaque);
1144         trace_savevm_section_end(se->idstr, se->section_id, ret);
1145         save_section_footer(f, se);
1146         if (ret < 0) {
1147             qemu_file_set_error(f, ret);
1148             return -1;
1149         }
1150     }
1151 
1152     if (iterable_only) {
1153         return 0;
1154     }
1155 
1156     vmdesc = qjson_new();
1157     json_prop_int(vmdesc, "page_size", qemu_target_page_size());
1158     json_start_array(vmdesc, "devices");
1159     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1160 
1161         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1162             continue;
1163         }
1164         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1165             trace_savevm_section_skip(se->idstr, se->section_id);
1166             continue;
1167         }
1168 
1169         trace_savevm_section_start(se->idstr, se->section_id);
1170 
1171         json_start_object(vmdesc, NULL);
1172         json_prop_str(vmdesc, "name", se->idstr);
1173         json_prop_int(vmdesc, "instance_id", se->instance_id);
1174 
1175         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1176         ret = vmstate_save(f, se, vmdesc);
1177         if (ret) {
1178             qemu_file_set_error(f, ret);
1179             return ret;
1180         }
1181         trace_savevm_section_end(se->idstr, se->section_id, 0);
1182         save_section_footer(f, se);
1183 
1184         json_end_object(vmdesc);
1185     }
1186 
1187     if (inactivate_disks) {
1188         /* Inactivate before sending QEMU_VM_EOF so that the
1189          * bdrv_invalidate_cache_all() on the other end won't fail. */
1190         ret = bdrv_inactivate_all();
1191         if (ret) {
1192             error_report("%s: bdrv_inactivate_all() failed (%d)",
1193                          __func__, ret);
1194             qemu_file_set_error(f, ret);
1195             return ret;
1196         }
1197     }
1198     if (!in_postcopy) {
1199         /* Postcopy stream will still be going */
1200         qemu_put_byte(f, QEMU_VM_EOF);
1201     }
1202 
1203     json_end_array(vmdesc);
1204     qjson_finish(vmdesc);
1205     vmdesc_len = strlen(qjson_get_str(vmdesc));
1206 
1207     if (should_send_vmdesc()) {
1208         qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1209         qemu_put_be32(f, vmdesc_len);
1210         qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len);
1211     }
1212     qjson_destroy(vmdesc);
1213 
1214     qemu_fflush(f);
1215     return 0;
1216 }
1217 
1218 /* Give an estimate of the amount left to be transferred,
1219  * the result is split into the amount for units that can and
1220  * for units that can't do postcopy.
1221  */
1222 void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1223                                uint64_t *res_non_postcopiable,
1224                                uint64_t *res_postcopiable)
1225 {
1226     SaveStateEntry *se;
1227 
1228     *res_non_postcopiable = 0;
1229     *res_postcopiable = 0;
1230 
1231 
1232     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1233         if (!se->ops || !se->ops->save_live_pending) {
1234             continue;
1235         }
1236         if (se->ops && se->ops->is_active) {
1237             if (!se->ops->is_active(se->opaque)) {
1238                 continue;
1239             }
1240         }
1241         se->ops->save_live_pending(f, se->opaque, threshold_size,
1242                                    res_non_postcopiable, res_postcopiable);
1243     }
1244 }
1245 
1246 void qemu_savevm_state_cleanup(void)
1247 {
1248     SaveStateEntry *se;
1249 
1250     trace_savevm_state_cleanup();
1251     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1252         if (se->ops && se->ops->save_cleanup) {
1253             se->ops->save_cleanup(se->opaque);
1254         }
1255     }
1256 }
1257 
1258 static int qemu_savevm_state(QEMUFile *f, Error **errp)
1259 {
1260     int ret;
1261     MigrationState *ms = migrate_get_current();
1262     MigrationStatus status;
1263 
1264     migrate_init(ms);
1265 
1266     ms->to_dst_file = f;
1267 
1268     if (migration_is_blocked(errp)) {
1269         ret = -EINVAL;
1270         goto done;
1271     }
1272 
1273     if (migrate_use_block()) {
1274         error_setg(errp, "Block migration and snapshots are incompatible");
1275         ret = -EINVAL;
1276         goto done;
1277     }
1278 
1279     qemu_mutex_unlock_iothread();
1280     qemu_savevm_state_header(f);
1281     qemu_savevm_state_setup(f);
1282     qemu_mutex_lock_iothread();
1283 
1284     while (qemu_file_get_error(f) == 0) {
1285         if (qemu_savevm_state_iterate(f, false) > 0) {
1286             break;
1287         }
1288     }
1289 
1290     ret = qemu_file_get_error(f);
1291     if (ret == 0) {
1292         qemu_savevm_state_complete_precopy(f, false, false);
1293         ret = qemu_file_get_error(f);
1294     }
1295     qemu_savevm_state_cleanup();
1296     if (ret != 0) {
1297         error_setg_errno(errp, -ret, "Error while writing VM state");
1298     }
1299 
1300 done:
1301     if (ret != 0) {
1302         status = MIGRATION_STATUS_FAILED;
1303     } else {
1304         status = MIGRATION_STATUS_COMPLETED;
1305     }
1306     migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1307 
1308     /* f is outer parameter, it should not stay in global migration state after
1309      * this function finished */
1310     ms->to_dst_file = NULL;
1311 
1312     return ret;
1313 }
1314 
1315 static int qemu_save_device_state(QEMUFile *f)
1316 {
1317     SaveStateEntry *se;
1318 
1319     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1320     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1321 
1322     cpu_synchronize_all_states();
1323 
1324     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1325         int ret;
1326 
1327         if (se->is_ram) {
1328             continue;
1329         }
1330         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1331             continue;
1332         }
1333         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1334             continue;
1335         }
1336 
1337         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1338 
1339         ret = vmstate_save(f, se, NULL);
1340         if (ret) {
1341             return ret;
1342         }
1343 
1344         save_section_footer(f, se);
1345     }
1346 
1347     qemu_put_byte(f, QEMU_VM_EOF);
1348 
1349     return qemu_file_get_error(f);
1350 }
1351 
1352 static SaveStateEntry *find_se(const char *idstr, int instance_id)
1353 {
1354     SaveStateEntry *se;
1355 
1356     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1357         if (!strcmp(se->idstr, idstr) &&
1358             (instance_id == se->instance_id ||
1359              instance_id == se->alias_id))
1360             return se;
1361         /* Migrating from an older version? */
1362         if (strstr(se->idstr, idstr) && se->compat) {
1363             if (!strcmp(se->compat->idstr, idstr) &&
1364                 (instance_id == se->compat->instance_id ||
1365                  instance_id == se->alias_id))
1366                 return se;
1367         }
1368     }
1369     return NULL;
1370 }
1371 
/*
 * Positive (non-error) return codes used by the loadvm command handlers,
 * as opposed to 0 (carry on) and negative values (errors).
 */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT     =  1,
};
1376 
1377 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
1378 
1379 /* ------ incoming postcopy messages ------ */
1380 /* 'advise' arrives before any transfers just to tell us that a postcopy
1381  * *might* happen - it might be skipped if precopy transferred everything
1382  * quickly.
1383  */
1384 static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1385                                          uint16_t len)
1386 {
1387     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1388     uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1389 
1390     trace_loadvm_postcopy_handle_advise();
1391     if (ps != POSTCOPY_INCOMING_NONE) {
1392         error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1393         return -1;
1394     }
1395 
1396     switch (len) {
1397     case 0:
1398         if (migrate_postcopy_ram()) {
1399             error_report("RAM postcopy is enabled but have 0 byte advise");
1400             return -EINVAL;
1401         }
1402         return 0;
1403     case 8 + 8:
1404         if (!migrate_postcopy_ram()) {
1405             error_report("RAM postcopy is disabled but have 16 byte advise");
1406             return -EINVAL;
1407         }
1408         break;
1409     default:
1410         error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1411         return -EINVAL;
1412     }
1413 
1414     if (!postcopy_ram_supported_by_host(mis)) {
1415         postcopy_state_set(POSTCOPY_INCOMING_NONE);
1416         return -1;
1417     }
1418 
1419     remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1420     local_pagesize_summary = ram_pagesize_summary();
1421 
1422     if (remote_pagesize_summary != local_pagesize_summary)  {
1423         /*
1424          * This detects two potential causes of mismatch:
1425          *   a) A mismatch in host page sizes
1426          *      Some combinations of mismatch are probably possible but it gets
1427          *      a bit more complicated.  In particular we need to place whole
1428          *      host pages on the dest at once, and we need to ensure that we
1429          *      handle dirtying to make sure we never end up sending part of
1430          *      a hostpage on it's own.
1431          *   b) The use of different huge page sizes on source/destination
1432          *      a more fine grain test is performed during RAM block migration
1433          *      but this test here causes a nice early clear failure, and
1434          *      also fails when passed to an older qemu that doesn't
1435          *      do huge pages.
1436          */
1437         error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1438                                                              " d=%" PRIx64 ")",
1439                      remote_pagesize_summary, local_pagesize_summary);
1440         return -1;
1441     }
1442 
1443     remote_tps = qemu_get_be64(mis->from_src_file);
1444     if (remote_tps != qemu_target_page_size()) {
1445         /*
1446          * Again, some differences could be dealt with, but for now keep it
1447          * simple.
1448          */
1449         error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1450                      (int)remote_tps, qemu_target_page_size());
1451         return -1;
1452     }
1453 
1454     if (ram_postcopy_incoming_init(mis)) {
1455         return -1;
1456     }
1457 
1458     postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1459 
1460     return 0;
1461 }
1462 
1463 /* After postcopy we will be told to throw some pages away since they're
1464  * dirty and will have to be demand fetched.  Must happen before CPU is
1465  * started.
1466  * There can be 0..many of these messages, each encoding multiple pages.
1467  */
1468 static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1469                                               uint16_t len)
1470 {
1471     int tmp;
1472     char ramid[256];
1473     PostcopyState ps = postcopy_state_get();
1474 
1475     trace_loadvm_postcopy_ram_handle_discard();
1476 
1477     switch (ps) {
1478     case POSTCOPY_INCOMING_ADVISE:
1479         /* 1st discard */
1480         tmp = postcopy_ram_prepare_discard(mis);
1481         if (tmp) {
1482             return tmp;
1483         }
1484         break;
1485 
1486     case POSTCOPY_INCOMING_DISCARD:
1487         /* Expected state */
1488         break;
1489 
1490     default:
1491         error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1492                      ps);
1493         return -1;
1494     }
1495     /* We're expecting a
1496      *    Version (0)
1497      *    a RAM ID string (length byte, name, 0 term)
1498      *    then at least 1 16 byte chunk
1499     */
1500     if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1501         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1502         return -1;
1503     }
1504 
1505     tmp = qemu_get_byte(mis->from_src_file);
1506     if (tmp != postcopy_ram_discard_version) {
1507         error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1508         return -1;
1509     }
1510 
1511     if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1512         error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1513         return -1;
1514     }
1515     tmp = qemu_get_byte(mis->from_src_file);
1516     if (tmp != 0) {
1517         error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1518         return -1;
1519     }
1520 
1521     len -= 3 + strlen(ramid);
1522     if (len % 16) {
1523         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1524         return -1;
1525     }
1526     trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1527     while (len) {
1528         uint64_t start_addr, block_length;
1529         start_addr = qemu_get_be64(mis->from_src_file);
1530         block_length = qemu_get_be64(mis->from_src_file);
1531 
1532         len -= 16;
1533         int ret = ram_discard_range(ramid, start_addr, block_length);
1534         if (ret) {
1535             return ret;
1536         }
1537     }
1538     trace_loadvm_postcopy_ram_handle_discard_end();
1539 
1540     return 0;
1541 }
1542 
1543 /*
1544  * Triggered by a postcopy_listen command; this thread takes over reading
1545  * the input stream, leaving the main thread free to carry on loading the rest
1546  * of the device state (from RAM).
1547  * (TODO:This could do with being in a postcopy file - but there again it's
1548  * just another input loop, not that postcopy specific)
1549  */
1550 static void *postcopy_ram_listen_thread(void *opaque)
1551 {
1552     QEMUFile *f = opaque;
1553     MigrationIncomingState *mis = migration_incoming_get_current();
1554     int load_res;
1555 
1556     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1557                                    MIGRATION_STATUS_POSTCOPY_ACTIVE);
1558     qemu_sem_post(&mis->listen_thread_sem);
1559     trace_postcopy_ram_listen_thread_start();
1560 
1561     /*
1562      * Because we're a thread and not a coroutine we can't yield
1563      * in qemu_file, and thus we must be blocking now.
1564      */
1565     qemu_file_set_blocking(f, true);
1566     load_res = qemu_loadvm_state_main(f, mis);
1567     /* And non-blocking again so we don't block in any cleanup */
1568     qemu_file_set_blocking(f, false);
1569 
1570     trace_postcopy_ram_listen_thread_exit();
1571     if (load_res < 0) {
1572         error_report("%s: loadvm failed: %d", __func__, load_res);
1573         qemu_file_set_error(f, load_res);
1574         migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1575                                        MIGRATION_STATUS_FAILED);
1576     } else {
1577         /*
1578          * This looks good, but it's possible that the device loading in the
1579          * main thread hasn't finished yet, and so we might not be in 'RUN'
1580          * state yet; wait for the end of the main thread.
1581          */
1582         qemu_event_wait(&mis->main_thread_load_event);
1583     }
1584     postcopy_ram_incoming_cleanup(mis);
1585 
1586     if (load_res < 0) {
1587         /*
1588          * If something went wrong then we have a bad state so exit;
1589          * depending how far we got it might be possible at this point
1590          * to leave the guest running and fire MCEs for pages that never
1591          * arrived as a desperate recovery step.
1592          */
1593         exit(EXIT_FAILURE);
1594     }
1595 
1596     migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1597                                    MIGRATION_STATUS_COMPLETED);
1598     /*
1599      * If everything has worked fine, then the main thread has waited
1600      * for us to start, and we're the last use of the mis.
1601      * (If something broke then qemu will have to exit anyway since it's
1602      * got a bad migration state).
1603      */
1604     migration_incoming_state_destroy();
1605     qemu_loadvm_state_cleanup();
1606 
1607     return NULL;
1608 }
1609 
1610 /* After this message we must be able to immediately receive postcopy data */
1611 static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1612 {
1613     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1614     trace_loadvm_postcopy_handle_listen();
1615     if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1616         error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1617         return -1;
1618     }
1619     if (ps == POSTCOPY_INCOMING_ADVISE) {
1620         /*
1621          * A rare case, we entered listen without having to do any discards,
1622          * so do the setup that's normally done at the time of the 1st discard.
1623          */
1624         if (migrate_postcopy_ram()) {
1625             postcopy_ram_prepare_discard(mis);
1626         }
1627     }
1628 
1629     /*
1630      * Sensitise RAM - can now generate requests for blocks that don't exist
1631      * However, at this point the CPU shouldn't be running, and the IO
1632      * shouldn't be doing anything yet so don't actually expect requests
1633      */
1634     if (migrate_postcopy_ram()) {
1635         if (postcopy_ram_enable_notify(mis)) {
1636             return -1;
1637         }
1638     }
1639 
1640     if (mis->have_listen_thread) {
1641         error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
1642         return -1;
1643     }
1644 
1645     mis->have_listen_thread = true;
1646     /* Start up the listening thread and wait for it to signal ready */
1647     qemu_sem_init(&mis->listen_thread_sem, 0);
1648     qemu_thread_create(&mis->listen_thread, "postcopy/listen",
1649                        postcopy_ram_listen_thread, mis->from_src_file,
1650                        QEMU_THREAD_DETACHED);
1651     qemu_sem_wait(&mis->listen_thread_sem);
1652     qemu_sem_destroy(&mis->listen_thread_sem);
1653 
1654     return 0;
1655 }
1656 
1657 
/*
 * Context handed to loadvm_postcopy_handle_run_bh(); it carries the bottom
 * half itself so that the callback can delete it once it has run.
 */
typedef struct {
    QEMUBH *bh; /* the scheduled bottom half */
} HandleRunBhData;
1661 
1662 static void loadvm_postcopy_handle_run_bh(void *opaque)
1663 {
1664     Error *local_err = NULL;
1665     HandleRunBhData *data = opaque;
1666 
1667     /* TODO we should move all of this lot into postcopy_ram.c or a shared code
1668      * in migration.c
1669      */
1670     cpu_synchronize_all_post_init();
1671 
1672     qemu_announce_self();
1673 
1674     /* Make sure all file formats flush their mutable metadata.
1675      * If we get an error here, just don't restart the VM yet. */
1676     bdrv_invalidate_cache_all(&local_err);
1677     if (local_err) {
1678         error_report_err(local_err);
1679         local_err = NULL;
1680         autostart = false;
1681     }
1682 
1683     trace_loadvm_postcopy_handle_run_cpu_sync();
1684     cpu_synchronize_all_post_init();
1685 
1686     trace_loadvm_postcopy_handle_run_vmstart();
1687 
1688     if (autostart) {
1689         /* Hold onto your hats, starting the CPU */
1690         vm_start();
1691     } else {
1692         /* leave it paused and let management decide when to start the CPU */
1693         runstate_set(RUN_STATE_PAUSED);
1694     }
1695 
1696     qemu_bh_delete(data->bh);
1697     g_free(data);
1698 }
1699 
1700 /* After all discards we can start running and asking for pages */
1701 static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
1702 {
1703     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
1704     HandleRunBhData *data;
1705 
1706     trace_loadvm_postcopy_handle_run();
1707     if (ps != POSTCOPY_INCOMING_LISTENING) {
1708         error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
1709         return -1;
1710     }
1711 
1712     data = g_new(HandleRunBhData, 1);
1713     data->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, data);
1714     qemu_bh_schedule(data->bh);
1715 
1716     /* We need to finish reading the stream from the package
1717      * and also stop reading anything more from the stream that loaded the
1718      * package (since it's now being read by the listener thread).
1719      * LOADVM_QUIT will quit all the layers of nested loadvm loops.
1720      */
1721     return LOADVM_QUIT;
1722 }
1723 
1724 /**
1725  * Immediately following this command is a blob of data containing an embedded
1726  * chunk of migration stream; read it and load it.
1727  *
1728  * @mis: Incoming state
1729  * @length: Length of packaged data to read
1730  *
1731  * Returns: Negative values on error
1732  *
1733  */
1734 static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
1735 {
1736     int ret;
1737     size_t length;
1738     QIOChannelBuffer *bioc;
1739 
1740     length = qemu_get_be32(mis->from_src_file);
1741     trace_loadvm_handle_cmd_packaged(length);
1742 
1743     if (length > MAX_VM_CMD_PACKAGED_SIZE) {
1744         error_report("Unreasonably large packaged state: %zu", length);
1745         return -1;
1746     }
1747 
1748     bioc = qio_channel_buffer_new(length);
1749     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
1750     ret = qemu_get_buffer(mis->from_src_file,
1751                           bioc->data,
1752                           length);
1753     if (ret != length) {
1754         object_unref(OBJECT(bioc));
1755         error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
1756                      ret, length);
1757         return (ret < 0) ? ret : -EAGAIN;
1758     }
1759     bioc->usage += length;
1760     trace_loadvm_handle_cmd_packaged_received(ret);
1761 
1762     QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
1763 
1764     ret = qemu_loadvm_state_main(packf, mis);
1765     trace_loadvm_handle_cmd_packaged_main(ret);
1766     qemu_fclose(packf);
1767     object_unref(OBJECT(bioc));
1768 
1769     return ret;
1770 }
1771 
1772 /*
1773  * Process an incoming 'QEMU_VM_COMMAND'
1774  * 0           just a normal return
1775  * LOADVM_QUIT All good, but exit the loop
1776  * <0          Error
1777  */
1778 static int loadvm_process_command(QEMUFile *f)
1779 {
1780     MigrationIncomingState *mis = migration_incoming_get_current();
1781     uint16_t cmd;
1782     uint16_t len;
1783     uint32_t tmp32;
1784 
1785     cmd = qemu_get_be16(f);
1786     len = qemu_get_be16(f);
1787 
1788     /* Check validity before continue processing of cmds */
1789     if (qemu_file_get_error(f)) {
1790         return qemu_file_get_error(f);
1791     }
1792 
1793     trace_loadvm_process_command(cmd, len);
1794     if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
1795         error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
1796         return -EINVAL;
1797     }
1798 
1799     if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
1800         error_report("%s received with bad length - expecting %zu, got %d",
1801                      mig_cmd_args[cmd].name,
1802                      (size_t)mig_cmd_args[cmd].len, len);
1803         return -ERANGE;
1804     }
1805 
1806     switch (cmd) {
1807     case MIG_CMD_OPEN_RETURN_PATH:
1808         if (mis->to_src_file) {
1809             error_report("CMD_OPEN_RETURN_PATH called when RP already open");
1810             /* Not really a problem, so don't give up */
1811             return 0;
1812         }
1813         mis->to_src_file = qemu_file_get_return_path(f);
1814         if (!mis->to_src_file) {
1815             error_report("CMD_OPEN_RETURN_PATH failed");
1816             return -1;
1817         }
1818         break;
1819 
1820     case MIG_CMD_PING:
1821         tmp32 = qemu_get_be32(f);
1822         trace_loadvm_process_command_ping(tmp32);
1823         if (!mis->to_src_file) {
1824             error_report("CMD_PING (0x%x) received with no return path",
1825                          tmp32);
1826             return -1;
1827         }
1828         migrate_send_rp_pong(mis, tmp32);
1829         break;
1830 
1831     case MIG_CMD_PACKAGED:
1832         return loadvm_handle_cmd_packaged(mis);
1833 
1834     case MIG_CMD_POSTCOPY_ADVISE:
1835         return loadvm_postcopy_handle_advise(mis, len);
1836 
1837     case MIG_CMD_POSTCOPY_LISTEN:
1838         return loadvm_postcopy_handle_listen(mis);
1839 
1840     case MIG_CMD_POSTCOPY_RUN:
1841         return loadvm_postcopy_handle_run(mis);
1842 
1843     case MIG_CMD_POSTCOPY_RAM_DISCARD:
1844         return loadvm_postcopy_ram_handle_discard(mis, len);
1845     }
1846 
1847     return 0;
1848 }
1849 
1850 /*
1851  * Read a footer off the wire and check that it matches the expected section
1852  *
1853  * Returns: true if the footer was good
1854  *          false if there is a problem (and calls error_report to say why)
1855  */
1856 static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
1857 {
1858     int ret;
1859     uint8_t read_mark;
1860     uint32_t read_section_id;
1861 
1862     if (!migrate_get_current()->send_section_footer) {
1863         /* No footer to check */
1864         return true;
1865     }
1866 
1867     read_mark = qemu_get_byte(f);
1868 
1869     ret = qemu_file_get_error(f);
1870     if (ret) {
1871         error_report("%s: Read section footer failed: %d",
1872                      __func__, ret);
1873         return false;
1874     }
1875 
1876     if (read_mark != QEMU_VM_SECTION_FOOTER) {
1877         error_report("Missing section footer for %s", se->idstr);
1878         return false;
1879     }
1880 
1881     read_section_id = qemu_get_be32(f);
1882     if (read_section_id != se->load_section_id) {
1883         error_report("Mismatched section id in footer for %s -"
1884                      " read 0x%x expected 0x%x",
1885                      se->idstr, read_section_id, se->load_section_id);
1886         return false;
1887     }
1888 
1889     /* All good */
1890     return true;
1891 }
1892 
1893 static int
1894 qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
1895 {
1896     uint32_t instance_id, version_id, section_id;
1897     SaveStateEntry *se;
1898     char idstr[256];
1899     int ret;
1900 
1901     /* Read section start */
1902     section_id = qemu_get_be32(f);
1903     if (!qemu_get_counted_string(f, idstr)) {
1904         error_report("Unable to read ID string for section %u",
1905                      section_id);
1906         return -EINVAL;
1907     }
1908     instance_id = qemu_get_be32(f);
1909     version_id = qemu_get_be32(f);
1910 
1911     ret = qemu_file_get_error(f);
1912     if (ret) {
1913         error_report("%s: Failed to read instance/version ID: %d",
1914                      __func__, ret);
1915         return ret;
1916     }
1917 
1918     trace_qemu_loadvm_state_section_startfull(section_id, idstr,
1919             instance_id, version_id);
1920     /* Find savevm section */
1921     se = find_se(idstr, instance_id);
1922     if (se == NULL) {
1923         error_report("Unknown savevm section or instance '%s' %d",
1924                      idstr, instance_id);
1925         return -EINVAL;
1926     }
1927 
1928     /* Validate version */
1929     if (version_id > se->version_id) {
1930         error_report("savevm: unsupported version %d for '%s' v%d",
1931                      version_id, idstr, se->version_id);
1932         return -EINVAL;
1933     }
1934     se->load_version_id = version_id;
1935     se->load_section_id = section_id;
1936 
1937     /* Validate if it is a device's state */
1938     if (xen_enabled() && se->is_ram) {
1939         error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
1940         return -EINVAL;
1941     }
1942 
1943     ret = vmstate_load(f, se);
1944     if (ret < 0) {
1945         error_report("error while loading state for instance 0x%x of"
1946                      " device '%s'", instance_id, idstr);
1947         return ret;
1948     }
1949     if (!check_section_footer(f, se)) {
1950         return -EINVAL;
1951     }
1952 
1953     return 0;
1954 }
1955 
1956 static int
1957 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
1958 {
1959     uint32_t section_id;
1960     SaveStateEntry *se;
1961     int ret;
1962 
1963     section_id = qemu_get_be32(f);
1964 
1965     ret = qemu_file_get_error(f);
1966     if (ret) {
1967         error_report("%s: Failed to read section ID: %d",
1968                      __func__, ret);
1969         return ret;
1970     }
1971 
1972     trace_qemu_loadvm_state_section_partend(section_id);
1973     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1974         if (se->load_section_id == section_id) {
1975             break;
1976         }
1977     }
1978     if (se == NULL) {
1979         error_report("Unknown savevm section %d", section_id);
1980         return -EINVAL;
1981     }
1982 
1983     ret = vmstate_load(f, se);
1984     if (ret < 0) {
1985         error_report("error while loading state section id %d(%s)",
1986                      section_id, se->idstr);
1987         return ret;
1988     }
1989     if (!check_section_footer(f, se)) {
1990         return -EINVAL;
1991     }
1992 
1993     return 0;
1994 }
1995 
1996 static int qemu_loadvm_state_setup(QEMUFile *f)
1997 {
1998     SaveStateEntry *se;
1999     int ret;
2000 
2001     trace_loadvm_state_setup();
2002     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2003         if (!se->ops || !se->ops->load_setup) {
2004             continue;
2005         }
2006         if (se->ops && se->ops->is_active) {
2007             if (!se->ops->is_active(se->opaque)) {
2008                 continue;
2009             }
2010         }
2011 
2012         ret = se->ops->load_setup(f, se->opaque);
2013         if (ret < 0) {
2014             qemu_file_set_error(f, ret);
2015             error_report("Load state of device %s failed", se->idstr);
2016             return ret;
2017         }
2018     }
2019     return 0;
2020 }
2021 
2022 void qemu_loadvm_state_cleanup(void)
2023 {
2024     SaveStateEntry *se;
2025 
2026     trace_loadvm_state_cleanup();
2027     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2028         if (se->ops && se->ops->load_cleanup) {
2029             se->ops->load_cleanup(se->opaque);
2030         }
2031     }
2032 }
2033 
2034 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2035 {
2036     uint8_t section_type;
2037     int ret = 0;
2038 
2039     while (true) {
2040         section_type = qemu_get_byte(f);
2041 
2042         if (qemu_file_get_error(f)) {
2043             ret = qemu_file_get_error(f);
2044             break;
2045         }
2046 
2047         trace_qemu_loadvm_state_section(section_type);
2048         switch (section_type) {
2049         case QEMU_VM_SECTION_START:
2050         case QEMU_VM_SECTION_FULL:
2051             ret = qemu_loadvm_section_start_full(f, mis);
2052             if (ret < 0) {
2053                 goto out;
2054             }
2055             break;
2056         case QEMU_VM_SECTION_PART:
2057         case QEMU_VM_SECTION_END:
2058             ret = qemu_loadvm_section_part_end(f, mis);
2059             if (ret < 0) {
2060                 goto out;
2061             }
2062             break;
2063         case QEMU_VM_COMMAND:
2064             ret = loadvm_process_command(f);
2065             trace_qemu_loadvm_state_section_command(ret);
2066             if ((ret < 0) || (ret & LOADVM_QUIT)) {
2067                 goto out;
2068             }
2069             break;
2070         case QEMU_VM_EOF:
2071             /* This is the end of migration */
2072             goto out;
2073         default:
2074             error_report("Unknown savevm section type %d", section_type);
2075             ret = -EINVAL;
2076             goto out;
2077         }
2078     }
2079 
2080 out:
2081     if (ret < 0) {
2082         qemu_file_set_error(f, ret);
2083     }
2084     return ret;
2085 }
2086 
2087 int qemu_loadvm_state(QEMUFile *f)
2088 {
2089     MigrationIncomingState *mis = migration_incoming_get_current();
2090     Error *local_err = NULL;
2091     unsigned int v;
2092     int ret;
2093 
2094     if (qemu_savevm_state_blocked(&local_err)) {
2095         error_report_err(local_err);
2096         return -EINVAL;
2097     }
2098 
2099     v = qemu_get_be32(f);
2100     if (v != QEMU_VM_FILE_MAGIC) {
2101         error_report("Not a migration stream");
2102         return -EINVAL;
2103     }
2104 
2105     v = qemu_get_be32(f);
2106     if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2107         error_report("SaveVM v2 format is obsolete and don't work anymore");
2108         return -ENOTSUP;
2109     }
2110     if (v != QEMU_VM_FILE_VERSION) {
2111         error_report("Unsupported migration stream version");
2112         return -ENOTSUP;
2113     }
2114 
2115     if (qemu_loadvm_state_setup(f) != 0) {
2116         return -EINVAL;
2117     }
2118 
2119     if (migrate_get_current()->send_configuration) {
2120         if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2121             error_report("Configuration section missing");
2122             return -EINVAL;
2123         }
2124         ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2125 
2126         if (ret) {
2127             return ret;
2128         }
2129     }
2130 
2131     cpu_synchronize_all_pre_loadvm();
2132 
2133     ret = qemu_loadvm_state_main(f, mis);
2134     qemu_event_set(&mis->main_thread_load_event);
2135 
2136     trace_qemu_loadvm_state_post_main(ret);
2137 
2138     if (mis->have_listen_thread) {
2139         /* Listen thread still going, can't clean up yet */
2140         return ret;
2141     }
2142 
2143     if (ret == 0) {
2144         ret = qemu_file_get_error(f);
2145     }
2146 
2147     /*
2148      * Try to read in the VMDESC section as well, so that dumping tools that
2149      * intercept our migration stream have the chance to see it.
2150      */
2151 
2152     /* We've got to be careful; if we don't read the data and just shut the fd
2153      * then the sender can error if we close while it's still sending.
2154      * We also mustn't read data that isn't there; some transports (RDMA)
2155      * will stall waiting for that data when the source has already closed.
2156      */
2157     if (ret == 0 && should_send_vmdesc()) {
2158         uint8_t *buf;
2159         uint32_t size;
2160         uint8_t  section_type = qemu_get_byte(f);
2161 
2162         if (section_type != QEMU_VM_VMDESCRIPTION) {
2163             error_report("Expected vmdescription section, but got %d",
2164                          section_type);
2165             /*
2166              * It doesn't seem worth failing at this point since
2167              * we apparently have an otherwise valid VM state
2168              */
2169         } else {
2170             buf = g_malloc(0x1000);
2171             size = qemu_get_be32(f);
2172 
2173             while (size > 0) {
2174                 uint32_t read_chunk = MIN(size, 0x1000);
2175                 qemu_get_buffer(f, buf, read_chunk);
2176                 size -= read_chunk;
2177             }
2178             g_free(buf);
2179         }
2180     }
2181 
2182     qemu_loadvm_state_cleanup();
2183     cpu_synchronize_all_post_init();
2184 
2185     return ret;
2186 }
2187 
2188 int save_snapshot(const char *name, Error **errp)
2189 {
2190     BlockDriverState *bs, *bs1;
2191     QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1;
2192     int ret = -1;
2193     QEMUFile *f;
2194     int saved_vm_running;
2195     uint64_t vm_state_size;
2196     qemu_timeval tv;
2197     struct tm tm;
2198     AioContext *aio_context;
2199 
2200     if (!bdrv_all_can_snapshot(&bs)) {
2201         error_setg(errp, "Device '%s' is writable but does not support "
2202                    "snapshots", bdrv_get_device_name(bs));
2203         return ret;
2204     }
2205 
2206     /* Delete old snapshots of the same name */
2207     if (name) {
2208         ret = bdrv_all_delete_snapshot(name, &bs1, errp);
2209         if (ret < 0) {
2210             error_prepend(errp, "Error while deleting snapshot on device "
2211                           "'%s': ", bdrv_get_device_name(bs1));
2212             return ret;
2213         }
2214     }
2215 
2216     bs = bdrv_all_find_vmstate_bs();
2217     if (bs == NULL) {
2218         error_setg(errp, "No block device can accept snapshots");
2219         return ret;
2220     }
2221     aio_context = bdrv_get_aio_context(bs);
2222 
2223     saved_vm_running = runstate_is_running();
2224 
2225     ret = global_state_store();
2226     if (ret) {
2227         error_setg(errp, "Error saving global state");
2228         return ret;
2229     }
2230     vm_stop(RUN_STATE_SAVE_VM);
2231 
2232     bdrv_drain_all_begin();
2233 
2234     aio_context_acquire(aio_context);
2235 
2236     memset(sn, 0, sizeof(*sn));
2237 
2238     /* fill auxiliary fields */
2239     qemu_gettimeofday(&tv);
2240     sn->date_sec = tv.tv_sec;
2241     sn->date_nsec = tv.tv_usec * 1000;
2242     sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2243 
2244     if (name) {
2245         ret = bdrv_snapshot_find(bs, old_sn, name);
2246         if (ret >= 0) {
2247             pstrcpy(sn->name, sizeof(sn->name), old_sn->name);
2248             pstrcpy(sn->id_str, sizeof(sn->id_str), old_sn->id_str);
2249         } else {
2250             pstrcpy(sn->name, sizeof(sn->name), name);
2251         }
2252     } else {
2253         /* cast below needed for OpenBSD where tv_sec is still 'long' */
2254         localtime_r((const time_t *)&tv.tv_sec, &tm);
2255         strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm);
2256     }
2257 
2258     /* save the VM state */
2259     f = qemu_fopen_bdrv(bs, 1);
2260     if (!f) {
2261         error_setg(errp, "Could not open VM state file");
2262         goto the_end;
2263     }
2264     ret = qemu_savevm_state(f, errp);
2265     vm_state_size = qemu_ftell(f);
2266     qemu_fclose(f);
2267     if (ret < 0) {
2268         goto the_end;
2269     }
2270 
2271     /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2272      * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2273      * it only releases the lock once.  Therefore synchronous I/O will deadlock
2274      * unless we release the AioContext before bdrv_all_create_snapshot().
2275      */
2276     aio_context_release(aio_context);
2277     aio_context = NULL;
2278 
2279     ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
2280     if (ret < 0) {
2281         error_setg(errp, "Error while creating snapshot on '%s'",
2282                    bdrv_get_device_name(bs));
2283         goto the_end;
2284     }
2285 
2286     ret = 0;
2287 
2288  the_end:
2289     if (aio_context) {
2290         aio_context_release(aio_context);
2291     }
2292 
2293     bdrv_drain_all_end();
2294 
2295     if (saved_vm_running) {
2296         vm_start();
2297     }
2298     return ret;
2299 }
2300 
2301 void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2302                                 Error **errp)
2303 {
2304     QEMUFile *f;
2305     QIOChannelFile *ioc;
2306     int saved_vm_running;
2307     int ret;
2308 
2309     if (!has_live) {
2310         /* live default to true so old version of Xen tool stack can have a
2311          * successfull live migration */
2312         live = true;
2313     }
2314 
2315     saved_vm_running = runstate_is_running();
2316     vm_stop(RUN_STATE_SAVE_VM);
2317     global_state_store_running();
2318 
2319     ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp);
2320     if (!ioc) {
2321         goto the_end;
2322     }
2323     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2324     f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2325     object_unref(OBJECT(ioc));
2326     ret = qemu_save_device_state(f);
2327     if (ret < 0 || qemu_fclose(f) < 0) {
2328         error_setg(errp, QERR_IO_ERROR);
2329     } else {
2330         /* libxl calls the QMP command "stop" before calling
2331          * "xen-save-devices-state" and in case of migration failure, libxl
2332          * would call "cont".
2333          * So call bdrv_inactivate_all (release locks) here to let the other
2334          * side of the migration take controle of the images.
2335          */
2336         if (live && !saved_vm_running) {
2337             ret = bdrv_inactivate_all();
2338             if (ret) {
2339                 error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2340                            __func__, ret);
2341             }
2342         }
2343     }
2344 
2345  the_end:
2346     if (saved_vm_running) {
2347         vm_start();
2348     }
2349 }
2350 
2351 void qmp_xen_load_devices_state(const char *filename, Error **errp)
2352 {
2353     QEMUFile *f;
2354     QIOChannelFile *ioc;
2355     int ret;
2356 
2357     /* Guest must be paused before loading the device state; the RAM state
2358      * will already have been loaded by xc
2359      */
2360     if (runstate_is_running()) {
2361         error_setg(errp, "Cannot update device state while vm is running");
2362         return;
2363     }
2364     vm_stop(RUN_STATE_RESTORE_VM);
2365 
2366     ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2367     if (!ioc) {
2368         return;
2369     }
2370     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2371     f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2372     object_unref(OBJECT(ioc));
2373 
2374     ret = qemu_loadvm_state(f);
2375     qemu_fclose(f);
2376     if (ret < 0) {
2377         error_setg(errp, QERR_IO_ERROR);
2378     }
2379     migration_incoming_state_destroy();
2380 }
2381 
2382 int load_snapshot(const char *name, Error **errp)
2383 {
2384     BlockDriverState *bs, *bs_vm_state;
2385     QEMUSnapshotInfo sn;
2386     QEMUFile *f;
2387     int ret;
2388     AioContext *aio_context;
2389     MigrationIncomingState *mis = migration_incoming_get_current();
2390 
2391     if (!bdrv_all_can_snapshot(&bs)) {
2392         error_setg(errp,
2393                    "Device '%s' is writable but does not support snapshots",
2394                    bdrv_get_device_name(bs));
2395         return -ENOTSUP;
2396     }
2397     ret = bdrv_all_find_snapshot(name, &bs);
2398     if (ret < 0) {
2399         error_setg(errp,
2400                    "Device '%s' does not have the requested snapshot '%s'",
2401                    bdrv_get_device_name(bs), name);
2402         return ret;
2403     }
2404 
2405     bs_vm_state = bdrv_all_find_vmstate_bs();
2406     if (!bs_vm_state) {
2407         error_setg(errp, "No block device supports snapshots");
2408         return -ENOTSUP;
2409     }
2410     aio_context = bdrv_get_aio_context(bs_vm_state);
2411 
2412     /* Don't even try to load empty VM states */
2413     aio_context_acquire(aio_context);
2414     ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
2415     aio_context_release(aio_context);
2416     if (ret < 0) {
2417         return ret;
2418     } else if (sn.vm_state_size == 0) {
2419         error_setg(errp, "This is a disk-only snapshot. Revert to it "
2420                    " offline using qemu-img");
2421         return -EINVAL;
2422     }
2423 
2424     /* Flush all IO requests so they don't interfere with the new state.  */
2425     bdrv_drain_all_begin();
2426 
2427     ret = bdrv_all_goto_snapshot(name, &bs, errp);
2428     if (ret < 0) {
2429         error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
2430                       name, bdrv_get_device_name(bs));
2431         goto err_drain;
2432     }
2433 
2434     /* restore the VM state */
2435     f = qemu_fopen_bdrv(bs_vm_state, 0);
2436     if (!f) {
2437         error_setg(errp, "Could not open VM state file");
2438         ret = -EINVAL;
2439         goto err_drain;
2440     }
2441 
2442     qemu_system_reset(SHUTDOWN_CAUSE_NONE);
2443     mis->from_src_file = f;
2444 
2445     aio_context_acquire(aio_context);
2446     ret = qemu_loadvm_state(f);
2447     migration_incoming_state_destroy();
2448     aio_context_release(aio_context);
2449 
2450     bdrv_drain_all_end();
2451 
2452     if (ret < 0) {
2453         error_setg(errp, "Error %d while loading VM state", ret);
2454         return ret;
2455     }
2456 
2457     return 0;
2458 
2459 err_drain:
2460     bdrv_drain_all_end();
2461     return ret;
2462 }
2463 
2464 void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
2465 {
2466     qemu_ram_set_idstr(mr->ram_block,
2467                        memory_region_name(mr), dev);
2468 }
2469 
2470 void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
2471 {
2472     qemu_ram_unset_idstr(mr->ram_block);
2473 }
2474 
2475 void vmstate_register_ram_global(MemoryRegion *mr)
2476 {
2477     vmstate_register_ram(mr, NULL);
2478 }
2479 
2480 bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
2481 {
2482     /* check needed if --only-migratable is specified */
2483     if (!migrate_get_current()->only_migratable) {
2484         return true;
2485     }
2486 
2487     return !(vmsd && vmsd->unmigratable);
2488 }
2489