xref: /openbmc/qemu/migration/savevm.c (revision cb8b8ef4578dc17c350fd4b27700a9f178e2dad0)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2009-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "hw/boards.h"
31 #include "hw/hw.h"
32 #include "hw/qdev.h"
33 #include "hw/xen/xen.h"
34 #include "net/net.h"
35 #include "sysemu/sysemu.h"
36 #include "qemu/timer.h"
37 #include "migration/migration.h"
38 #include "migration/snapshot.h"
39 #include "ram.h"
40 #include "qemu-file-channel.h"
41 #include "qemu-file.h"
42 #include "savevm.h"
43 #include "postcopy-ram.h"
44 #include "qapi/qmp/qerror.h"
45 #include "qemu/error-report.h"
46 #include "qemu/queue.h"
47 #include "sysemu/cpus.h"
48 #include "exec/memory.h"
49 #include "exec/target_page.h"
50 #include "qmp-commands.h"
51 #include "trace.h"
52 #include "qemu/bitops.h"
53 #include "qemu/iov.h"
54 #include "block/snapshot.h"
55 #include "qemu/cutils.h"
56 #include "io/channel-buffer.h"
57 #include "io/channel-file.h"
58 
/* Constants used when building the RARP self-announcement frame. */
#ifndef ETH_P_RARP
#define ETH_P_RARP          0x8035  /* ethertype of the announcement */
#endif
#define ARP_HTYPE_ETH       0x0001  /* hardware address space */
#define ARP_PTYPE_IP        0x0800  /* protocol address space */
#define ARP_OP_REQUEST_REV  0x3     /* opcode placed in the RARP header */
65 
/* Version byte stored at the start of a POSTCOPY_RAM_DISCARD payload. */
const unsigned int postcopy_ram_discard_version = 0;

/* When true, save_section_footer() writes nothing to the stream. */
static bool skip_section_footers;
69 
/*
 * Sub-command identifiers carried by a QEMU_VM_COMMAND element.
 * The values are implicit, so the order of the enumerators matters.
 */
enum qemu_vm_cmd {
    /* Must be 0 */
    MIG_CMD_INVALID = 0,
    /* Tell the dest to open the Return path */
    MIG_CMD_OPEN_RETURN_PATH,
    /* Request a PONG on the RP */
    MIG_CMD_PING,

    /* Prior to any page transfers, just warn we might want to do PC */
    MIG_CMD_POSTCOPY_ADVISE,
    /* Start listening for incoming pages as it's running. */
    MIG_CMD_POSTCOPY_LISTEN,
    /* Start execution */
    MIG_CMD_POSTCOPY_RUN,

    /*
     * A list of pages to discard that were previously sent during
     * precopy but are dirty.
     */
    MIG_CMD_POSTCOPY_RAM_DISCARD,
    /* Send a wrapped stream within this stream */
    MIG_CMD_PACKAGED,
    /* Number of commands; also the last slot of mig_cmd_args[] */
    MIG_CMD_MAX
};
88 
89 #define MAX_VM_CMD_PACKAGED_SIZE (1ul << 24)
90 static struct mig_cmd_args {
91     ssize_t     len; /* -1 = variable */
92     const char *name;
93 } mig_cmd_args[] = {
94     [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
95     [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
96     [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
97     [MIG_CMD_POSTCOPY_ADVISE]  = { .len = 16, .name = "POSTCOPY_ADVISE" },
98     [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
99     [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
100     [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
101                                    .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
102     [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
103     [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
104 };
105 
106 static int announce_self_create(uint8_t *buf,
107                                 uint8_t *mac_addr)
108 {
109     /* Ethernet header. */
110     memset(buf, 0xff, 6);         /* destination MAC addr */
111     memcpy(buf + 6, mac_addr, 6); /* source MAC addr */
112     *(uint16_t *)(buf + 12) = htons(ETH_P_RARP); /* ethertype */
113 
114     /* RARP header. */
115     *(uint16_t *)(buf + 14) = htons(ARP_HTYPE_ETH); /* hardware addr space */
116     *(uint16_t *)(buf + 16) = htons(ARP_PTYPE_IP); /* protocol addr space */
117     *(buf + 18) = 6; /* hardware addr length (ethernet) */
118     *(buf + 19) = 4; /* protocol addr length (IPv4) */
119     *(uint16_t *)(buf + 20) = htons(ARP_OP_REQUEST_REV); /* opcode */
120     memcpy(buf + 22, mac_addr, 6); /* source hw addr */
121     memset(buf + 28, 0x00, 4);     /* source protocol addr */
122     memcpy(buf + 32, mac_addr, 6); /* target hw addr */
123     memset(buf + 38, 0x00, 4);     /* target protocol addr */
124 
125     /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */
126     memset(buf + 42, 0x00, 18);
127 
128     return 60; /* len (FCS will be added by hardware) */
129 }
130 
131 static void qemu_announce_self_iter(NICState *nic, void *opaque)
132 {
133     uint8_t buf[60];
134     int len;
135 
136     trace_qemu_announce_self_iter(qemu_ether_ntoa(&nic->conf->macaddr));
137     len = announce_self_create(buf, nic->conf->macaddr.a);
138 
139     qemu_send_packet_raw(qemu_get_queue(nic), buf, len);
140 }
141 
142 
143 static void qemu_announce_self_once(void *opaque)
144 {
145     static int count = SELF_ANNOUNCE_ROUNDS;
146     QEMUTimer *timer = *(QEMUTimer **)opaque;
147 
148     qemu_foreach_nic(qemu_announce_self_iter, NULL);
149 
150     if (--count) {
151         /* delay 50ms, 150ms, 250ms, ... */
152         timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
153                   self_announce_delay(count));
154     } else {
155             timer_del(timer);
156             timer_free(timer);
157     }
158 }
159 
160 void qemu_announce_self(void)
161 {
162     static QEMUTimer *timer;
163     timer = timer_new_ms(QEMU_CLOCK_REALTIME, qemu_announce_self_once, &timer);
164     qemu_announce_self_once(&timer);
165 }
166 
167 /***********************************************************/
168 /* savevm/loadvm support */
169 
170 static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
171                                    int64_t pos)
172 {
173     int ret;
174     QEMUIOVector qiov;
175 
176     qemu_iovec_init_external(&qiov, iov, iovcnt);
177     ret = bdrv_writev_vmstate(opaque, &qiov, pos);
178     if (ret < 0) {
179         return ret;
180     }
181 
182     return qiov.size;
183 }
184 
/*
 * QEMUFile read hook: forwards the request to bdrv_load_vmstate() and
 * returns its result.
 */
static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
                                size_t size)
{
    ssize_t nread = bdrv_load_vmstate(opaque, buf, pos, size);

    return nread;
}
190 
/* QEMUFile close hook: flush the backing block device, return its status. */
static int bdrv_fclose(void *opaque)
{
    int status = bdrv_flush(opaque);

    return status;
}
195 
196 static const QEMUFileOps bdrv_read_ops = {
197     .get_buffer = block_get_buffer,
198     .close =      bdrv_fclose
199 };
200 
201 static const QEMUFileOps bdrv_write_ops = {
202     .writev_buffer  = block_writev_buffer,
203     .close          = bdrv_fclose
204 };
205 
206 static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
207 {
208     if (is_writable) {
209         return qemu_fopen_ops(bs, &bdrv_write_ops);
210     }
211     return qemu_fopen_ops(bs, &bdrv_read_ops);
212 }
213 
214 
215 /* QEMUFile timer support.
216  * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
217  */
218 
219 void timer_put(QEMUFile *f, QEMUTimer *ts)
220 {
221     uint64_t expire_time;
222 
223     expire_time = timer_expire_time_ns(ts);
224     qemu_put_be64(f, expire_time);
225 }
226 
227 void timer_get(QEMUFile *f, QEMUTimer *ts)
228 {
229     uint64_t expire_time;
230 
231     expire_time = qemu_get_be64(f);
232     if (expire_time != -1) {
233         timer_mod_ns(ts, expire_time);
234     } else {
235         timer_del(ts);
236     }
237 }
238 
239 
240 /* VMState timer support.
241  * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
242  */
243 
244 static int get_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field)
245 {
246     QEMUTimer *v = pv;
247     timer_get(f, v);
248     return 0;
249 }
250 
251 static int put_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field,
252                      QJSON *vmdesc)
253 {
254     QEMUTimer *v = pv;
255     timer_put(f, v);
256 
257     return 0;
258 }
259 
260 const VMStateInfo vmstate_info_timer = {
261     .name = "timer",
262     .get  = get_timer,
263     .put  = put_timer,
264 };
265 
266 
/*
 * Compatibility identity of a registered entry: the plain id string
 * (without the device-path prefix) and the instance id that goes with it.
 */
typedef struct CompatEntry {
    char idstr[256];    /* id string, NUL terminated */
    int instance_id;    /* instance number among entries sharing idstr */
} CompatEntry;
271 
272 typedef struct SaveStateEntry {
273     QTAILQ_ENTRY(SaveStateEntry) entry;
274     char idstr[256];
275     int instance_id;
276     int alias_id;
277     int version_id;
278     /* version id read from the stream */
279     int load_version_id;
280     int section_id;
281     /* section id read from the stream */
282     int load_section_id;
283     SaveVMHandlers *ops;
284     const VMStateDescription *vmsd;
285     void *opaque;
286     CompatEntry *compat;
287     int is_ram;
288 } SaveStateEntry;
289 
290 typedef struct SaveState {
291     QTAILQ_HEAD(, SaveStateEntry) handlers;
292     int global_section_id;
293     bool skip_configuration;
294     uint32_t len;
295     const char *name;
296     uint32_t target_page_bits;
297 } SaveState;
298 
299 static SaveState savevm_state = {
300     .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
301     .global_section_id = 0,
302     .skip_configuration = false,
303 };
304 
305 void savevm_skip_configuration(void)
306 {
307     savevm_state.skip_configuration = true;
308 }
309 
310 
311 static void configuration_pre_save(void *opaque)
312 {
313     SaveState *state = opaque;
314     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
315 
316     state->len = strlen(current_name);
317     state->name = current_name;
318     state->target_page_bits = qemu_target_page_bits();
319 }
320 
321 static int configuration_pre_load(void *opaque)
322 {
323     SaveState *state = opaque;
324 
325     /* If there is no target-page-bits subsection it means the source
326      * predates the variable-target-page-bits support and is using the
327      * minimum possible value for this CPU.
328      */
329     state->target_page_bits = qemu_target_page_bits_min();
330     return 0;
331 }
332 
333 static int configuration_post_load(void *opaque, int version_id)
334 {
335     SaveState *state = opaque;
336     const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
337 
338     if (strncmp(state->name, current_name, state->len) != 0) {
339         error_report("Machine type received is '%.*s' and local is '%s'",
340                      (int) state->len, state->name, current_name);
341         return -EINVAL;
342     }
343 
344     if (state->target_page_bits != qemu_target_page_bits()) {
345         error_report("Received TARGET_PAGE_BITS is %d but local is %d",
346                      state->target_page_bits, qemu_target_page_bits());
347         return -EINVAL;
348     }
349 
350     return 0;
351 }
352 
/*
 * The target-page-bits subsection is present only if the target page
 * size differs from the default (i.e. the minimum page size for a
 * variable-page-size guest CPU).  When present it carries the actual
 * target page bits of the machine, and migration fails if the two ends
 * do not agree on it.
 *
 * Returns true when the current page bits exceed the minimum.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    return qemu_target_page_bits_min() < qemu_target_page_bits();
}
365 
366 static const VMStateDescription vmstate_target_page_bits = {
367     .name = "configuration/target-page-bits",
368     .version_id = 1,
369     .minimum_version_id = 1,
370     .needed = vmstate_target_page_bits_needed,
371     .fields = (VMStateField[]) {
372         VMSTATE_UINT32(target_page_bits, SaveState),
373         VMSTATE_END_OF_LIST()
374     }
375 };
376 
377 static const VMStateDescription vmstate_configuration = {
378     .name = "configuration",
379     .version_id = 1,
380     .pre_load = configuration_pre_load,
381     .post_load = configuration_post_load,
382     .pre_save = configuration_pre_save,
383     .fields = (VMStateField[]) {
384         VMSTATE_UINT32(len, SaveState),
385         VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
386         VMSTATE_END_OF_LIST()
387     },
388     .subsections = (const VMStateDescription*[]) {
389         &vmstate_target_page_bits,
390         NULL
391     }
392 };
393 
394 static void dump_vmstate_vmsd(FILE *out_file,
395                               const VMStateDescription *vmsd, int indent,
396                               bool is_subsection);
397 
398 static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
399                               int indent)
400 {
401     fprintf(out_file, "%*s{\n", indent, "");
402     indent += 2;
403     fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
404     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
405             field->version_id);
406     fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
407             field->field_exists ? "true" : "false");
408     fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
409     if (field->vmsd != NULL) {
410         fprintf(out_file, ",\n");
411         dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
412     }
413     fprintf(out_file, "\n%*s}", indent - 2, "");
414 }
415 
416 static void dump_vmstate_vmss(FILE *out_file,
417                               const VMStateDescription **subsection,
418                               int indent)
419 {
420     if (*subsection != NULL) {
421         dump_vmstate_vmsd(out_file, *subsection, indent, true);
422     }
423 }
424 
425 static void dump_vmstate_vmsd(FILE *out_file,
426                               const VMStateDescription *vmsd, int indent,
427                               bool is_subsection)
428 {
429     if (is_subsection) {
430         fprintf(out_file, "%*s{\n", indent, "");
431     } else {
432         fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
433     }
434     indent += 2;
435     fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
436     fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
437             vmsd->version_id);
438     fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
439             vmsd->minimum_version_id);
440     if (vmsd->fields != NULL) {
441         const VMStateField *field = vmsd->fields;
442         bool first;
443 
444         fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
445         first = true;
446         while (field->name != NULL) {
447             if (field->flags & VMS_MUST_EXIST) {
448                 /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
449                 field++;
450                 continue;
451             }
452             if (!first) {
453                 fprintf(out_file, ",\n");
454             }
455             dump_vmstate_vmsf(out_file, field, indent + 2);
456             field++;
457             first = false;
458         }
459         fprintf(out_file, "\n%*s]", indent, "");
460     }
461     if (vmsd->subsections != NULL) {
462         const VMStateDescription **subsection = vmsd->subsections;
463         bool first;
464 
465         fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
466         first = true;
467         while (*subsection != NULL) {
468             if (!first) {
469                 fprintf(out_file, ",\n");
470             }
471             dump_vmstate_vmss(out_file, subsection, indent + 2);
472             subsection++;
473             first = false;
474         }
475         fprintf(out_file, "\n%*s]", indent, "");
476     }
477     fprintf(out_file, "\n%*s}", indent - 2, "");
478 }
479 
480 static void dump_machine_type(FILE *out_file)
481 {
482     MachineClass *mc;
483 
484     mc = MACHINE_GET_CLASS(current_machine);
485 
486     fprintf(out_file, "  \"vmschkmachine\": {\n");
487     fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
488     fprintf(out_file, "  },\n");
489 }
490 
491 void dump_vmstate_json_to_file(FILE *out_file)
492 {
493     GSList *list, *elt;
494     bool first;
495 
496     fprintf(out_file, "{\n");
497     dump_machine_type(out_file);
498 
499     first = true;
500     list = object_class_get_list(TYPE_DEVICE, true);
501     for (elt = list; elt; elt = elt->next) {
502         DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
503                                              TYPE_DEVICE);
504         const char *name;
505         int indent = 2;
506 
507         if (!dc->vmsd) {
508             continue;
509         }
510 
511         if (!first) {
512             fprintf(out_file, ",\n");
513         }
514         name = object_class_get_name(OBJECT_CLASS(dc));
515         fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
516         indent += 2;
517         fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
518         fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
519                 dc->vmsd->version_id);
520         fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
521                 dc->vmsd->minimum_version_id);
522 
523         dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
524 
525         fprintf(out_file, "\n%*s}", indent - 2, "");
526         first = false;
527     }
528     fprintf(out_file, "\n}\n");
529     fclose(out_file);
530 }
531 
532 static int calculate_new_instance_id(const char *idstr)
533 {
534     SaveStateEntry *se;
535     int instance_id = 0;
536 
537     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
538         if (strcmp(idstr, se->idstr) == 0
539             && instance_id <= se->instance_id) {
540             instance_id = se->instance_id + 1;
541         }
542     }
543     return instance_id;
544 }
545 
546 static int calculate_compat_instance_id(const char *idstr)
547 {
548     SaveStateEntry *se;
549     int instance_id = 0;
550 
551     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
552         if (!se->compat) {
553             continue;
554         }
555 
556         if (strcmp(idstr, se->compat->idstr) == 0
557             && instance_id <= se->compat->instance_id) {
558             instance_id = se->compat->instance_id + 1;
559         }
560     }
561     return instance_id;
562 }
563 
564 static inline MigrationPriority save_state_priority(SaveStateEntry *se)
565 {
566     if (se->vmsd) {
567         return se->vmsd->priority;
568     }
569     return MIG_PRI_DEFAULT;
570 }
571 
572 static void savevm_state_handler_insert(SaveStateEntry *nse)
573 {
574     MigrationPriority priority = save_state_priority(nse);
575     SaveStateEntry *se;
576 
577     assert(priority <= MIG_PRI_MAX);
578 
579     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
580         if (save_state_priority(se) < priority) {
581             break;
582         }
583     }
584 
585     if (se) {
586         QTAILQ_INSERT_BEFORE(se, nse, entry);
587     } else {
588         QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
589     }
590 }
591 
592 /* TODO: Individual devices generally have very little idea about the rest
593    of the system, so instance_id should be removed/replaced.
594    Meanwhile pass -1 as instance_id if you do not already have a clearly
595    distinguishing id for all instances of your device class. */
596 int register_savevm_live(DeviceState *dev,
597                          const char *idstr,
598                          int instance_id,
599                          int version_id,
600                          SaveVMHandlers *ops,
601                          void *opaque)
602 {
603     SaveStateEntry *se;
604 
605     se = g_new0(SaveStateEntry, 1);
606     se->version_id = version_id;
607     se->section_id = savevm_state.global_section_id++;
608     se->ops = ops;
609     se->opaque = opaque;
610     se->vmsd = NULL;
611     /* if this is a live_savem then set is_ram */
612     if (ops->save_live_setup != NULL) {
613         se->is_ram = 1;
614     }
615 
616     if (dev) {
617         char *id = qdev_get_dev_path(dev);
618         if (id) {
619             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
620                 sizeof(se->idstr)) {
621                 error_report("Path too long for VMState (%s)", id);
622                 g_free(id);
623                 g_free(se);
624 
625                 return -1;
626             }
627             g_free(id);
628 
629             se->compat = g_new0(CompatEntry, 1);
630             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr);
631             se->compat->instance_id = instance_id == -1 ?
632                          calculate_compat_instance_id(idstr) : instance_id;
633             instance_id = -1;
634         }
635     }
636     pstrcat(se->idstr, sizeof(se->idstr), idstr);
637 
638     if (instance_id == -1) {
639         se->instance_id = calculate_new_instance_id(se->idstr);
640     } else {
641         se->instance_id = instance_id;
642     }
643     assert(!se->compat || se->instance_id == 0);
644     savevm_state_handler_insert(se);
645     return 0;
646 }
647 
648 int register_savevm(DeviceState *dev,
649                     const char *idstr,
650                     int instance_id,
651                     int version_id,
652                     SaveStateHandler *save_state,
653                     LoadStateHandler *load_state,
654                     void *opaque)
655 {
656     SaveVMHandlers *ops = g_new0(SaveVMHandlers, 1);
657     ops->save_state = save_state;
658     ops->load_state = load_state;
659     return register_savevm_live(dev, idstr, instance_id, version_id,
660                                 ops, opaque);
661 }
662 
663 void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque)
664 {
665     SaveStateEntry *se, *new_se;
666     char id[256] = "";
667 
668     if (dev) {
669         char *path = qdev_get_dev_path(dev);
670         if (path) {
671             pstrcpy(id, sizeof(id), path);
672             pstrcat(id, sizeof(id), "/");
673             g_free(path);
674         }
675     }
676     pstrcat(id, sizeof(id), idstr);
677 
678     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
679         if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
680             QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
681             g_free(se->compat);
682             g_free(se->ops);
683             g_free(se);
684         }
685     }
686 }
687 
688 int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
689                                    const VMStateDescription *vmsd,
690                                    void *opaque, int alias_id,
691                                    int required_for_version,
692                                    Error **errp)
693 {
694     SaveStateEntry *se;
695 
696     /* If this triggers, alias support can be dropped for the vmsd. */
697     assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
698 
699     se = g_new0(SaveStateEntry, 1);
700     se->version_id = vmsd->version_id;
701     se->section_id = savevm_state.global_section_id++;
702     se->opaque = opaque;
703     se->vmsd = vmsd;
704     se->alias_id = alias_id;
705 
706     if (dev) {
707         char *id = qdev_get_dev_path(dev);
708         if (id) {
709             if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
710                 sizeof(se->idstr)) {
711                 error_setg(errp, "Path too long for VMState (%s)", id);
712                 g_free(id);
713                 g_free(se);
714 
715                 return -1;
716             }
717             g_free(id);
718 
719             se->compat = g_new0(CompatEntry, 1);
720             pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
721             se->compat->instance_id = instance_id == -1 ?
722                          calculate_compat_instance_id(vmsd->name) : instance_id;
723             instance_id = -1;
724         }
725     }
726     pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
727 
728     if (instance_id == -1) {
729         se->instance_id = calculate_new_instance_id(se->idstr);
730     } else {
731         se->instance_id = instance_id;
732     }
733     assert(!se->compat || se->instance_id == 0);
734     savevm_state_handler_insert(se);
735     return 0;
736 }
737 
738 void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd,
739                         void *opaque)
740 {
741     SaveStateEntry *se, *new_se;
742 
743     QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
744         if (se->vmsd == vmsd && se->opaque == opaque) {
745             QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
746             g_free(se->compat);
747             g_free(se);
748         }
749     }
750 }
751 
752 static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
753 {
754     trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
755     if (!se->vmsd) {         /* Old style */
756         return se->ops->load_state(f, se->opaque, se->load_version_id);
757     }
758     return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
759 }
760 
761 static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
762 {
763     int64_t old_offset, size;
764 
765     old_offset = qemu_ftell_fast(f);
766     se->ops->save_state(f, se->opaque);
767     size = qemu_ftell_fast(f) - old_offset;
768 
769     if (vmdesc) {
770         json_prop_int(vmdesc, "size", size);
771         json_start_array(vmdesc, "fields");
772         json_start_object(vmdesc, NULL);
773         json_prop_str(vmdesc, "name", "data");
774         json_prop_int(vmdesc, "size", size);
775         json_prop_str(vmdesc, "type", "buffer");
776         json_end_object(vmdesc);
777         json_end_array(vmdesc);
778     }
779 }
780 
781 static void vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
782 {
783     trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
784     if (!se->vmsd) {
785         vmstate_save_old_style(f, se, vmdesc);
786         return;
787     }
788     vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
789 }
790 
791 void savevm_skip_section_footers(void)
792 {
793     skip_section_footers = true;
794 }
795 
796 /*
797  * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
798  */
799 static void save_section_header(QEMUFile *f, SaveStateEntry *se,
800                                 uint8_t section_type)
801 {
802     qemu_put_byte(f, section_type);
803     qemu_put_be32(f, se->section_id);
804 
805     if (section_type == QEMU_VM_SECTION_FULL ||
806         section_type == QEMU_VM_SECTION_START) {
807         /* ID string */
808         size_t len = strlen(se->idstr);
809         qemu_put_byte(f, len);
810         qemu_put_buffer(f, (uint8_t *)se->idstr, len);
811 
812         qemu_put_be32(f, se->instance_id);
813         qemu_put_be32(f, se->version_id);
814     }
815 }
816 
817 /*
818  * Write a footer onto device sections that catches cases misformatted device
819  * sections.
820  */
821 static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
822 {
823     if (!skip_section_footers) {
824         qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
825         qemu_put_be32(f, se->section_id);
826     }
827 }
828 
829 /**
830  * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
831  *                           command and associated data.
832  *
833  * @f: File to send command on
834  * @command: Command type to send
835  * @len: Length of associated data
836  * @data: Data associated with command.
837  */
838 static void qemu_savevm_command_send(QEMUFile *f,
839                                      enum qemu_vm_cmd command,
840                                      uint16_t len,
841                                      uint8_t *data)
842 {
843     trace_savevm_command_send(command, len);
844     qemu_put_byte(f, QEMU_VM_COMMAND);
845     qemu_put_be16(f, (uint16_t)command);
846     qemu_put_be16(f, len);
847     qemu_put_buffer(f, data, len);
848     qemu_fflush(f);
849 }
850 
851 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
852 {
853     uint32_t buf;
854 
855     trace_savevm_send_ping(value);
856     buf = cpu_to_be32(value);
857     qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
858 }
859 
860 void qemu_savevm_send_open_return_path(QEMUFile *f)
861 {
862     trace_savevm_send_open_return_path();
863     qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
864 }
865 
866 /* We have a buffer of data to send; we don't want that all to be loaded
867  * by the command itself, so the command contains just the length of the
868  * extra buffer that we then send straight after it.
869  * TODO: Must be a better way to organise that
870  *
871  * Returns:
872  *    0 on success
873  *    -ve on error
874  */
875 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
876 {
877     uint32_t tmp;
878 
879     if (len > MAX_VM_CMD_PACKAGED_SIZE) {
880         error_report("%s: Unreasonably large packaged state: %zu",
881                      __func__, len);
882         return -1;
883     }
884 
885     tmp = cpu_to_be32(len);
886 
887     trace_qemu_savevm_send_packaged();
888     qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
889 
890     qemu_put_buffer(f, buf, len);
891 
892     return 0;
893 }
894 
895 /* Send prior to any postcopy transfer */
896 void qemu_savevm_send_postcopy_advise(QEMUFile *f)
897 {
898     uint64_t tmp[2];
899     tmp[0] = cpu_to_be64(ram_pagesize_summary());
900     tmp[1] = cpu_to_be64(qemu_target_page_size());
901 
902     trace_qemu_savevm_send_postcopy_advise();
903     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp);
904 }
905 
906 /* Sent prior to starting the destination running in postcopy, discard pages
907  * that have already been sent but redirtied on the source.
908  * CMD_POSTCOPY_RAM_DISCARD consist of:
909  *      byte   version (0)
910  *      byte   Length of name field (not including 0)
911  *  n x byte   RAM block name
912  *      byte   0 terminator (just for safety)
913  *  n x        Byte ranges within the named RAMBlock
914  *      be64   Start of the range
915  *      be64   Length
916  *
917  *  name:  RAMBlock name that these entries are part of
918  *  len: Number of page entries
919  *  start_list: 'len' addresses
920  *  length_list: 'len' addresses
921  *
922  */
923 void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
924                                            uint16_t len,
925                                            uint64_t *start_list,
926                                            uint64_t *length_list)
927 {
928     uint8_t *buf;
929     uint16_t tmplen;
930     uint16_t t;
931     size_t name_len = strlen(name);
932 
933     trace_qemu_savevm_send_postcopy_ram_discard(name, len);
934     assert(name_len < 256);
935     buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
936     buf[0] = postcopy_ram_discard_version;
937     buf[1] = name_len;
938     memcpy(buf + 2, name, name_len);
939     tmplen = 2 + name_len;
940     buf[tmplen++] = '\0';
941 
942     for (t = 0; t < len; t++) {
943         stq_be_p(buf + tmplen, start_list[t]);
944         tmplen += 8;
945         stq_be_p(buf + tmplen, length_list[t]);
946         tmplen += 8;
947     }
948     qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
949     g_free(buf);
950 }
951 
/*
 * Get the destination into a state where it can receive postcopy data.
 * Sends MIG_CMD_POSTCOPY_LISTEN on @f; the command carries no payload.
 */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    /* Zero-length command: length 0 and no data buffer */
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}
958 
/*
 * Kick the destination into running.
 * Sends MIG_CMD_POSTCOPY_RUN on @f; the command carries no payload.
 */
void qemu_savevm_send_postcopy_run(QEMUFile *f)
{
    trace_savevm_send_postcopy_run();
    /* Zero-length command: length 0 and no data buffer */
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
}
965 
966 bool qemu_savevm_state_blocked(Error **errp)
967 {
968     SaveStateEntry *se;
969 
970     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
971         if (se->vmsd && se->vmsd->unmigratable) {
972             error_setg(errp, "State blocked by non-migratable device '%s'",
973                        se->idstr);
974             return true;
975         }
976     }
977     return false;
978 }
979 
980 static bool enforce_config_section(void)
981 {
982     MachineState *machine = MACHINE(qdev_get_machine());
983     return machine->enforce_config_section;
984 }
985 
986 void qemu_savevm_state_header(QEMUFile *f)
987 {
988     trace_savevm_state_header();
989     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
990     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
991 
992     if (!savevm_state.skip_configuration || enforce_config_section()) {
993         qemu_put_byte(f, QEMU_VM_CONFIGURATION);
994         vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
995     }
996 
997 }
998 
999 void qemu_savevm_state_begin(QEMUFile *f)
1000 {
1001     SaveStateEntry *se;
1002     int ret;
1003 
1004     trace_savevm_state_begin();
1005     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1006         if (!se->ops || !se->ops->save_live_setup) {
1007             continue;
1008         }
1009         if (se->ops && se->ops->is_active) {
1010             if (!se->ops->is_active(se->opaque)) {
1011                 continue;
1012             }
1013         }
1014         save_section_header(f, se, QEMU_VM_SECTION_START);
1015 
1016         ret = se->ops->save_live_setup(f, se->opaque);
1017         save_section_footer(f, se);
1018         if (ret < 0) {
1019             qemu_file_set_error(f, ret);
1020             break;
1021         }
1022     }
1023 }
1024 
1025 /*
1026  * this function has three return values:
1027  *   negative: there was one error, and we have -errno.
1028  *   0 : We haven't finished, caller have to go again
1029  *   1 : We have finished, we can go to complete phase
1030  */
1031 int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1032 {
1033     SaveStateEntry *se;
1034     int ret = 1;
1035 
1036     trace_savevm_state_iterate();
1037     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1038         if (!se->ops || !se->ops->save_live_iterate) {
1039             continue;
1040         }
1041         if (se->ops && se->ops->is_active) {
1042             if (!se->ops->is_active(se->opaque)) {
1043                 continue;
1044             }
1045         }
1046         /*
1047          * In the postcopy phase, any device that doesn't know how to
1048          * do postcopy should have saved it's state in the _complete
1049          * call that's already run, it might get confused if we call
1050          * iterate afterwards.
1051          */
1052         if (postcopy && !se->ops->save_live_complete_postcopy) {
1053             continue;
1054         }
1055         if (qemu_file_rate_limit(f)) {
1056             return 0;
1057         }
1058         trace_savevm_section_start(se->idstr, se->section_id);
1059 
1060         save_section_header(f, se, QEMU_VM_SECTION_PART);
1061 
1062         ret = se->ops->save_live_iterate(f, se->opaque);
1063         trace_savevm_section_end(se->idstr, se->section_id, ret);
1064         save_section_footer(f, se);
1065 
1066         if (ret < 0) {
1067             qemu_file_set_error(f, ret);
1068         }
1069         if (ret <= 0) {
1070             /* Do not proceed to the next vmstate before this one reported
1071                completion of the current stage. This serializes the migration
1072                and reduces the probability that a faster changing state is
1073                synchronized over and over again. */
1074             break;
1075         }
1076     }
1077     return ret;
1078 }
1079 
1080 static bool should_send_vmdesc(void)
1081 {
1082     MachineState *machine = MACHINE(qdev_get_machine());
1083     bool in_postcopy = migration_in_postcopy();
1084     return !machine->suppress_vmdesc && !in_postcopy;
1085 }
1086 
1087 /*
1088  * Calls the save_live_complete_postcopy methods
1089  * causing the last few pages to be sent immediately and doing any associated
1090  * cleanup.
1091  * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1092  * all the other devices, but that happens at the point we switch to postcopy.
1093  */
1094 void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1095 {
1096     SaveStateEntry *se;
1097     int ret;
1098 
1099     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1100         if (!se->ops || !se->ops->save_live_complete_postcopy) {
1101             continue;
1102         }
1103         if (se->ops && se->ops->is_active) {
1104             if (!se->ops->is_active(se->opaque)) {
1105                 continue;
1106             }
1107         }
1108         trace_savevm_section_start(se->idstr, se->section_id);
1109         /* Section type */
1110         qemu_put_byte(f, QEMU_VM_SECTION_END);
1111         qemu_put_be32(f, se->section_id);
1112 
1113         ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1114         trace_savevm_section_end(se->idstr, se->section_id, ret);
1115         save_section_footer(f, se);
1116         if (ret < 0) {
1117             qemu_file_set_error(f, ret);
1118             return;
1119         }
1120     }
1121 
1122     qemu_put_byte(f, QEMU_VM_EOF);
1123     qemu_fflush(f);
1124 }
1125 
1126 void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only)
1127 {
1128     QJSON *vmdesc;
1129     int vmdesc_len;
1130     SaveStateEntry *se;
1131     int ret;
1132     bool in_postcopy = migration_in_postcopy();
1133 
1134     trace_savevm_state_complete_precopy();
1135 
1136     cpu_synchronize_all_states();
1137 
1138     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1139         if (!se->ops ||
1140             (in_postcopy && se->ops->save_live_complete_postcopy) ||
1141             (in_postcopy && !iterable_only) ||
1142             !se->ops->save_live_complete_precopy) {
1143             continue;
1144         }
1145 
1146         if (se->ops && se->ops->is_active) {
1147             if (!se->ops->is_active(se->opaque)) {
1148                 continue;
1149             }
1150         }
1151         trace_savevm_section_start(se->idstr, se->section_id);
1152 
1153         save_section_header(f, se, QEMU_VM_SECTION_END);
1154 
1155         ret = se->ops->save_live_complete_precopy(f, se->opaque);
1156         trace_savevm_section_end(se->idstr, se->section_id, ret);
1157         save_section_footer(f, se);
1158         if (ret < 0) {
1159             qemu_file_set_error(f, ret);
1160             return;
1161         }
1162     }
1163 
1164     if (iterable_only) {
1165         return;
1166     }
1167 
1168     vmdesc = qjson_new();
1169     json_prop_int(vmdesc, "page_size", qemu_target_page_size());
1170     json_start_array(vmdesc, "devices");
1171     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1172 
1173         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1174             continue;
1175         }
1176         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1177             trace_savevm_section_skip(se->idstr, se->section_id);
1178             continue;
1179         }
1180 
1181         trace_savevm_section_start(se->idstr, se->section_id);
1182 
1183         json_start_object(vmdesc, NULL);
1184         json_prop_str(vmdesc, "name", se->idstr);
1185         json_prop_int(vmdesc, "instance_id", se->instance_id);
1186 
1187         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1188         vmstate_save(f, se, vmdesc);
1189         trace_savevm_section_end(se->idstr, se->section_id, 0);
1190         save_section_footer(f, se);
1191 
1192         json_end_object(vmdesc);
1193     }
1194 
1195     if (!in_postcopy) {
1196         /* Postcopy stream will still be going */
1197         qemu_put_byte(f, QEMU_VM_EOF);
1198     }
1199 
1200     json_end_array(vmdesc);
1201     qjson_finish(vmdesc);
1202     vmdesc_len = strlen(qjson_get_str(vmdesc));
1203 
1204     if (should_send_vmdesc()) {
1205         qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1206         qemu_put_be32(f, vmdesc_len);
1207         qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len);
1208     }
1209     qjson_destroy(vmdesc);
1210 
1211     qemu_fflush(f);
1212 }
1213 
1214 /* Give an estimate of the amount left to be transferred,
1215  * the result is split into the amount for units that can and
1216  * for units that can't do postcopy.
1217  */
1218 void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1219                                uint64_t *res_non_postcopiable,
1220                                uint64_t *res_postcopiable)
1221 {
1222     SaveStateEntry *se;
1223 
1224     *res_non_postcopiable = 0;
1225     *res_postcopiable = 0;
1226 
1227 
1228     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1229         if (!se->ops || !se->ops->save_live_pending) {
1230             continue;
1231         }
1232         if (se->ops && se->ops->is_active) {
1233             if (!se->ops->is_active(se->opaque)) {
1234                 continue;
1235             }
1236         }
1237         se->ops->save_live_pending(f, se->opaque, threshold_size,
1238                                    res_non_postcopiable, res_postcopiable);
1239     }
1240 }
1241 
1242 void qemu_savevm_state_cleanup(void)
1243 {
1244     SaveStateEntry *se;
1245 
1246     trace_savevm_state_cleanup();
1247     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1248         if (se->ops && se->ops->cleanup) {
1249             se->ops->cleanup(se->opaque);
1250         }
1251     }
1252 }
1253 
1254 static int qemu_savevm_state(QEMUFile *f, Error **errp)
1255 {
1256     int ret;
1257     MigrationState *ms = migrate_init();
1258     MigrationStatus status;
1259     ms->to_dst_file = f;
1260 
1261     if (migration_is_blocked(errp)) {
1262         ret = -EINVAL;
1263         goto done;
1264     }
1265 
1266     if (migrate_use_block()) {
1267         error_setg(errp, "Block migration and snapshots are incompatible");
1268         ret = -EINVAL;
1269         goto done;
1270     }
1271 
1272     qemu_mutex_unlock_iothread();
1273     qemu_savevm_state_header(f);
1274     qemu_savevm_state_begin(f);
1275     qemu_mutex_lock_iothread();
1276 
1277     while (qemu_file_get_error(f) == 0) {
1278         if (qemu_savevm_state_iterate(f, false) > 0) {
1279             break;
1280         }
1281     }
1282 
1283     ret = qemu_file_get_error(f);
1284     if (ret == 0) {
1285         qemu_savevm_state_complete_precopy(f, false);
1286         ret = qemu_file_get_error(f);
1287     }
1288     qemu_savevm_state_cleanup();
1289     if (ret != 0) {
1290         error_setg_errno(errp, -ret, "Error while writing VM state");
1291     }
1292 
1293 done:
1294     if (ret != 0) {
1295         status = MIGRATION_STATUS_FAILED;
1296     } else {
1297         status = MIGRATION_STATUS_COMPLETED;
1298     }
1299     migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1300 
1301     /* f is outer parameter, it should not stay in global migration state after
1302      * this function finished */
1303     ms->to_dst_file = NULL;
1304 
1305     return ret;
1306 }
1307 
1308 static int qemu_save_device_state(QEMUFile *f)
1309 {
1310     SaveStateEntry *se;
1311 
1312     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1313     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1314 
1315     cpu_synchronize_all_states();
1316 
1317     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1318         if (se->is_ram) {
1319             continue;
1320         }
1321         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1322             continue;
1323         }
1324         if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1325             continue;
1326         }
1327 
1328         save_section_header(f, se, QEMU_VM_SECTION_FULL);
1329 
1330         vmstate_save(f, se, NULL);
1331 
1332         save_section_footer(f, se);
1333     }
1334 
1335     qemu_put_byte(f, QEMU_VM_EOF);
1336 
1337     return qemu_file_get_error(f);
1338 }
1339 
1340 static SaveStateEntry *find_se(const char *idstr, int instance_id)
1341 {
1342     SaveStateEntry *se;
1343 
1344     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1345         if (!strcmp(se->idstr, idstr) &&
1346             (instance_id == se->instance_id ||
1347              instance_id == se->alias_id))
1348             return se;
1349         /* Migrating from an older version? */
1350         if (strstr(se->idstr, idstr) && se->compat) {
1351             if (!strcmp(se->compat->idstr, idstr) &&
1352                 (instance_id == se->compat->instance_id ||
1353                  instance_id == se->alias_id))
1354                 return se;
1355         }
1356     }
1357     return NULL;
1358 }
1359 
/*
 * Positive, non-error return codes used by the loadvm command handlers.
 */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT     =  1,
};
1364 
1365 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
1366 
1367 /* ------ incoming postcopy messages ------ */
1368 /* 'advise' arrives before any transfers just to tell us that a postcopy
1369  * *might* happen - it might be skipped if precopy transferred everything
1370  * quickly.
1371  */
1372 static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
1373 {
1374     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1375     uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1376 
1377     trace_loadvm_postcopy_handle_advise();
1378     if (ps != POSTCOPY_INCOMING_NONE) {
1379         error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1380         return -1;
1381     }
1382 
1383     if (!postcopy_ram_supported_by_host()) {
1384         postcopy_state_set(POSTCOPY_INCOMING_NONE);
1385         return -1;
1386     }
1387 
1388     remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1389     local_pagesize_summary = ram_pagesize_summary();
1390 
1391     if (remote_pagesize_summary != local_pagesize_summary)  {
1392         /*
1393          * This detects two potential causes of mismatch:
1394          *   a) A mismatch in host page sizes
1395          *      Some combinations of mismatch are probably possible but it gets
1396          *      a bit more complicated.  In particular we need to place whole
1397          *      host pages on the dest at once, and we need to ensure that we
1398          *      handle dirtying to make sure we never end up sending part of
1399          *      a hostpage on it's own.
1400          *   b) The use of different huge page sizes on source/destination
1401          *      a more fine grain test is performed during RAM block migration
1402          *      but this test here causes a nice early clear failure, and
1403          *      also fails when passed to an older qemu that doesn't
1404          *      do huge pages.
1405          */
1406         error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1407                                                              " d=%" PRIx64 ")",
1408                      remote_pagesize_summary, local_pagesize_summary);
1409         return -1;
1410     }
1411 
1412     remote_tps = qemu_get_be64(mis->from_src_file);
1413     if (remote_tps != qemu_target_page_size()) {
1414         /*
1415          * Again, some differences could be dealt with, but for now keep it
1416          * simple.
1417          */
1418         error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1419                      (int)remote_tps, qemu_target_page_size());
1420         return -1;
1421     }
1422 
1423     if (ram_postcopy_incoming_init(mis)) {
1424         return -1;
1425     }
1426 
1427     postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1428 
1429     return 0;
1430 }
1431 
1432 /* After postcopy we will be told to throw some pages away since they're
1433  * dirty and will have to be demand fetched.  Must happen before CPU is
1434  * started.
1435  * There can be 0..many of these messages, each encoding multiple pages.
1436  */
1437 static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1438                                               uint16_t len)
1439 {
1440     int tmp;
1441     char ramid[256];
1442     PostcopyState ps = postcopy_state_get();
1443 
1444     trace_loadvm_postcopy_ram_handle_discard();
1445 
1446     switch (ps) {
1447     case POSTCOPY_INCOMING_ADVISE:
1448         /* 1st discard */
1449         tmp = postcopy_ram_prepare_discard(mis);
1450         if (tmp) {
1451             return tmp;
1452         }
1453         break;
1454 
1455     case POSTCOPY_INCOMING_DISCARD:
1456         /* Expected state */
1457         break;
1458 
1459     default:
1460         error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1461                      ps);
1462         return -1;
1463     }
1464     /* We're expecting a
1465      *    Version (0)
1466      *    a RAM ID string (length byte, name, 0 term)
1467      *    then at least 1 16 byte chunk
1468     */
1469     if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1470         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1471         return -1;
1472     }
1473 
1474     tmp = qemu_get_byte(mis->from_src_file);
1475     if (tmp != postcopy_ram_discard_version) {
1476         error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1477         return -1;
1478     }
1479 
1480     if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1481         error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1482         return -1;
1483     }
1484     tmp = qemu_get_byte(mis->from_src_file);
1485     if (tmp != 0) {
1486         error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1487         return -1;
1488     }
1489 
1490     len -= 3 + strlen(ramid);
1491     if (len % 16) {
1492         error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1493         return -1;
1494     }
1495     trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1496     while (len) {
1497         uint64_t start_addr, block_length;
1498         start_addr = qemu_get_be64(mis->from_src_file);
1499         block_length = qemu_get_be64(mis->from_src_file);
1500 
1501         len -= 16;
1502         int ret = ram_discard_range(ramid, start_addr, block_length);
1503         if (ret) {
1504             return ret;
1505         }
1506     }
1507     trace_loadvm_postcopy_ram_handle_discard_end();
1508 
1509     return 0;
1510 }
1511 
/*
 * Triggered by a postcopy_listen command; this thread takes over reading
 * the input stream, leaving the main thread free to carry on loading the rest
 * of the device state (from RAM).
 * (TODO:This could do with being in a postcopy file - but there again it's
 * just another input loop, not that postcopy specific)
 *
 * @opaque: the QEMUFile to read the incoming stream from
 *
 * Returns NULL.  If loading the stream fails the process exits with
 * EXIT_FAILURE instead of returning.
 */
static void *postcopy_ram_listen_thread(void *opaque)
{
    QEMUFile *f = opaque;
    MigrationIncomingState *mis = migration_incoming_get_current();
    int load_res;

    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
    /* Tell the creator (which waits on this semaphore) that we are up */
    qemu_sem_post(&mis->listen_thread_sem);
    trace_postcopy_ram_listen_thread_start();

    /*
     * Because we're a thread and not a coroutine we can't yield
     * in qemu_file, and thus we must be blocking now.
     */
    qemu_file_set_blocking(f, true);
    load_res = qemu_loadvm_state_main(f, mis);
    /* And non-blocking again so we don't block in any cleanup */
    qemu_file_set_blocking(f, false);

    trace_postcopy_ram_listen_thread_exit();
    if (load_res < 0) {
        /* Record the failure on the file and in the migration state */
        error_report("%s: loadvm failed: %d", __func__, load_res);
        qemu_file_set_error(f, load_res);
        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                       MIGRATION_STATUS_FAILED);
    } else {
        /*
         * This looks good, but it's possible that the device loading in the
         * main thread hasn't finished yet, and so we might not be in 'RUN'
         * state yet; wait for the end of the main thread.
         */
        qemu_event_wait(&mis->main_thread_load_event);
    }
    /* Cleanup runs on both the success and the failure path */
    postcopy_ram_incoming_cleanup(mis);

    if (load_res < 0) {
        /*
         * If something went wrong then we have a bad state so exit;
         * depending how far we got it might be possible at this point
         * to leave the guest running and fire MCEs for pages that never
         * arrived as a desperate recovery step.
         */
        exit(EXIT_FAILURE);
    }

    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                   MIGRATION_STATUS_COMPLETED);
    /*
     * If everything has worked fine, then the main thread has waited
     * for us to start, and we're the last use of the mis.
     * (If something broke then qemu will have to exit anyway since it's
     * got a bad migration state).
     */
    migration_incoming_state_destroy();


    return NULL;
}
1578 
/*
 * After this message we must be able to immediately receive postcopy data.
 *
 * Moves the postcopy state to LISTENING, enables RAM fault notification and
 * starts the detached listen thread on mis->from_src_file, waiting until
 * that thread has signalled that it is running.
 *
 * Returns 0 on success, -1 on failure.
 */
static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
{
    /* postcopy_state_set() hands back the state we were in before */
    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
    trace_loadvm_postcopy_handle_listen();
    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
        return -1;
    }
    if (ps == POSTCOPY_INCOMING_ADVISE) {
        /*
         * A rare case, we entered listen without having to do any discards,
         * so do the setup that's normally done at the time of the 1st discard.
         */
        /*
         * NOTE(review): the return value is ignored here, whereas the RAM
         * discard command handler propagates it - confirm this is intended.
         */
        postcopy_ram_prepare_discard(mis);
    }

    /*
     * Sensitise RAM - can now generate requests for blocks that don't exist
     * However, at this point the CPU shouldn't be running, and the IO
     * shouldn't be doing anything yet so don't actually expect requests
     */
    if (postcopy_ram_enable_notify(mis)) {
        return -1;
    }

    if (mis->have_listen_thread) {
        error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
        return -1;
    }

    mis->have_listen_thread = true;
    /* Start up the listening thread and wait for it to signal ready */
    qemu_sem_init(&mis->listen_thread_sem, 0);
    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
                       postcopy_ram_listen_thread, mis->from_src_file,
                       QEMU_THREAD_DETACHED);
    qemu_sem_wait(&mis->listen_thread_sem);
    /* The semaphore is only needed for this start-up handshake */
    qemu_sem_destroy(&mis->listen_thread_sem);

    return 0;
}
1621 
1622 
/*
 * Context passed to loadvm_postcopy_handle_run_bh(); it carries the bottom
 * half itself so that the handler can delete it once it has run.
 */
typedef struct {
    QEMUBH *bh;
} HandleRunBhData;
1626 
/*
 * Bottom half scheduled by loadvm_postcopy_handle_run(): finishes CPU
 * post-init synchronisation, announces the guest, invalidates block caches
 * and then either starts the VM or leaves it paused.
 *
 * @opaque: the HandleRunBhData allocated by the scheduler; both the bottom
 *          half it holds and the structure itself are released here.
 */
static void loadvm_postcopy_handle_run_bh(void *opaque)
{
    Error *local_err = NULL;
    HandleRunBhData *data = opaque;

    /* TODO we should move all of this lot into postcopy_ram.c or a shared code
     * in migration.c
     */
    cpu_synchronize_all_post_init();

    qemu_announce_self();

    /* Make sure all file formats flush their mutable metadata.
     * If we get an error here, just don't restart the VM yet. */
    bdrv_invalidate_cache_all(&local_err);
    if (local_err) {
        error_report_err(local_err);
        local_err = NULL;
        autostart = false;
    }

    trace_loadvm_postcopy_handle_run_cpu_sync();
    /*
     * NOTE(review): cpu_synchronize_all_post_init() was already called at
     * the top of this function - confirm the second call is required.
     */
    cpu_synchronize_all_post_init();

    trace_loadvm_postcopy_handle_run_vmstart();

    if (autostart) {
        /* Hold onto your hats, starting the CPU */
        vm_start();
    } else {
        /* leave it paused and let management decide when to start the CPU */
        runstate_set(RUN_STATE_PAUSED);
    }

    /* One-shot bottom half: release it together with its context */
    qemu_bh_delete(data->bh);
    g_free(data);
}
1664 
1665 /* After all discards we can start running and asking for pages */
1666 static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
1667 {
1668     PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
1669     HandleRunBhData *data;
1670 
1671     trace_loadvm_postcopy_handle_run();
1672     if (ps != POSTCOPY_INCOMING_LISTENING) {
1673         error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
1674         return -1;
1675     }
1676 
1677     data = g_new(HandleRunBhData, 1);
1678     data->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, data);
1679     qemu_bh_schedule(data->bh);
1680 
1681     /* We need to finish reading the stream from the package
1682      * and also stop reading anything more from the stream that loaded the
1683      * package (since it's now being read by the listener thread).
1684      * LOADVM_QUIT will quit all the layers of nested loadvm loops.
1685      */
1686     return LOADVM_QUIT;
1687 }
1688 
1689 /**
1690  * Immediately following this command is a blob of data containing an embedded
1691  * chunk of migration stream; read it and load it.
1692  *
1693  * @mis: Incoming state
1694  * @length: Length of packaged data to read
1695  *
1696  * Returns: Negative values on error
1697  *
1698  */
1699 static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
1700 {
1701     int ret;
1702     size_t length;
1703     QIOChannelBuffer *bioc;
1704 
1705     length = qemu_get_be32(mis->from_src_file);
1706     trace_loadvm_handle_cmd_packaged(length);
1707 
1708     if (length > MAX_VM_CMD_PACKAGED_SIZE) {
1709         error_report("Unreasonably large packaged state: %zu", length);
1710         return -1;
1711     }
1712 
1713     bioc = qio_channel_buffer_new(length);
1714     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
1715     ret = qemu_get_buffer(mis->from_src_file,
1716                           bioc->data,
1717                           length);
1718     if (ret != length) {
1719         object_unref(OBJECT(bioc));
1720         error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
1721                      ret, length);
1722         return (ret < 0) ? ret : -EAGAIN;
1723     }
1724     bioc->usage += length;
1725     trace_loadvm_handle_cmd_packaged_received(ret);
1726 
1727     QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
1728 
1729     ret = qemu_loadvm_state_main(packf, mis);
1730     trace_loadvm_handle_cmd_packaged_main(ret);
1731     qemu_fclose(packf);
1732     object_unref(OBJECT(bioc));
1733 
1734     return ret;
1735 }
1736 
/*
 * Process an incoming 'QEMU_VM_COMMAND'
 *
 * The command header is read from @f as two 16-bit big-endian values:
 * the command id followed by the payload length.
 *
 * Returns:
 * 0           just a normal return
 * LOADVM_QUIT All good, but exit the loop
 * <0          Error
 */
static int loadvm_process_command(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    uint16_t cmd;
    uint16_t len;
    uint32_t tmp32;

    cmd = qemu_get_be16(f);
    len = qemu_get_be16(f);

    trace_loadvm_process_command(cmd, len);
    /* Reject ids outside the known command range before indexing the table */
    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
        return -EINVAL;
    }

    /* A table length of -1 disables the exact-length check */
    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
        error_report("%s received with bad length - expecting %zu, got %d",
                     mig_cmd_args[cmd].name,
                     (size_t)mig_cmd_args[cmd].len, len);
        return -ERANGE;
    }

    switch (cmd) {
    case MIG_CMD_OPEN_RETURN_PATH:
        if (mis->to_src_file) {
            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
            /* Not really a problem, so don't give up */
            return 0;
        }
        mis->to_src_file = qemu_file_get_return_path(f);
        if (!mis->to_src_file) {
            error_report("CMD_OPEN_RETURN_PATH failed");
            return -1;
        }
        break;

    case MIG_CMD_PING:
        /* Payload is a 32-bit value that is sent back in the pong */
        tmp32 = qemu_get_be32(f);
        trace_loadvm_process_command_ping(tmp32);
        if (!mis->to_src_file) {
            error_report("CMD_PING (0x%x) received with no return path",
                         tmp32);
            return -1;
        }
        migrate_send_rp_pong(mis, tmp32);
        break;

    /* The remaining commands are delegated to their dedicated handlers */
    case MIG_CMD_PACKAGED:
        return loadvm_handle_cmd_packaged(mis);

    case MIG_CMD_POSTCOPY_ADVISE:
        return loadvm_postcopy_handle_advise(mis);

    case MIG_CMD_POSTCOPY_LISTEN:
        return loadvm_postcopy_handle_listen(mis);

    case MIG_CMD_POSTCOPY_RUN:
        return loadvm_postcopy_handle_run(mis);

    case MIG_CMD_POSTCOPY_RAM_DISCARD:
        return loadvm_postcopy_ram_handle_discard(mis, len);
    }

    return 0;
}
1809 
1810 /*
1811  * Read a footer off the wire and check that it matches the expected section
1812  *
1813  * Returns: true if the footer was good
1814  *          false if there is a problem (and calls error_report to say why)
1815  */
1816 static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
1817 {
1818     uint8_t read_mark;
1819     uint32_t read_section_id;
1820 
1821     if (skip_section_footers) {
1822         /* No footer to check */
1823         return true;
1824     }
1825 
1826     read_mark = qemu_get_byte(f);
1827 
1828     if (read_mark != QEMU_VM_SECTION_FOOTER) {
1829         error_report("Missing section footer for %s", se->idstr);
1830         return false;
1831     }
1832 
1833     read_section_id = qemu_get_be32(f);
1834     if (read_section_id != se->load_section_id) {
1835         error_report("Mismatched section id in footer for %s -"
1836                      " read 0x%x expected 0x%x",
1837                      se->idstr, read_section_id, se->load_section_id);
1838         return false;
1839     }
1840 
1841     /* All good */
1842     return true;
1843 }
1844 
/*
 * Load one section that starts with a full section header.
 *
 * The header is read from @f in this order: 32-bit section id, counted
 * id string, 32-bit instance id, 32-bit version id.  The matching handler
 * is looked up, the version is validated, the state is loaded and the
 * section footer is checked.
 *
 * @f: stream to read from
 * @mis: incoming migration state (not used in this function)
 *
 * Returns 0 on success, a negative value on error.
 */
static int
qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
{
    uint32_t instance_id, version_id, section_id;
    SaveStateEntry *se;
    char idstr[256];
    int ret;

    /* Read section start */
    section_id = qemu_get_be32(f);
    if (!qemu_get_counted_string(f, idstr)) {
        error_report("Unable to read ID string for section %u",
                     section_id);
        return -EINVAL;
    }
    instance_id = qemu_get_be32(f);
    version_id = qemu_get_be32(f);

    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
            instance_id, version_id);
    /* Find savevm section */
    se = find_se(idstr, instance_id);
    if (se == NULL) {
        error_report("Unknown savevm section or instance '%s' %d",
                     idstr, instance_id);
        return -EINVAL;
    }

    /* Validate version */
    if (version_id > se->version_id) {
        error_report("savevm: unsupported version %d for '%s' v%d",
                     version_id, idstr, se->version_id);
        return -EINVAL;
    }
    /* Remember what the stream announced; the footer check uses the id */
    se->load_version_id = version_id;
    se->load_section_id = section_id;

    /* Validate if it is a device's state */
    if (xen_enabled() && se->is_ram) {
        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
        return -EINVAL;
    }

    ret = vmstate_load(f, se);
    if (ret < 0) {
        error_report("error while loading state for instance 0x%x of"
                     " device '%s'", instance_id, idstr);
        return ret;
    }
    if (!check_section_footer(f, se)) {
        return -EINVAL;
    }

    return 0;
}
1900 
1901 static int
1902 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
1903 {
1904     uint32_t section_id;
1905     SaveStateEntry *se;
1906     int ret;
1907 
1908     section_id = qemu_get_be32(f);
1909 
1910     trace_qemu_loadvm_state_section_partend(section_id);
1911     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1912         if (se->load_section_id == section_id) {
1913             break;
1914         }
1915     }
1916     if (se == NULL) {
1917         error_report("Unknown savevm section %d", section_id);
1918         return -EINVAL;
1919     }
1920 
1921     ret = vmstate_load(f, se);
1922     if (ret < 0) {
1923         error_report("error while loading state section id %d(%s)",
1924                      section_id, se->idstr);
1925         return ret;
1926     }
1927     if (!check_section_footer(f, se)) {
1928         return -EINVAL;
1929     }
1930 
1931     return 0;
1932 }
1933 
1934 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
1935 {
1936     uint8_t section_type;
1937     int ret = 0;
1938 
1939     while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
1940         ret = 0;
1941         trace_qemu_loadvm_state_section(section_type);
1942         switch (section_type) {
1943         case QEMU_VM_SECTION_START:
1944         case QEMU_VM_SECTION_FULL:
1945             ret = qemu_loadvm_section_start_full(f, mis);
1946             if (ret < 0) {
1947                 goto out;
1948             }
1949             break;
1950         case QEMU_VM_SECTION_PART:
1951         case QEMU_VM_SECTION_END:
1952             ret = qemu_loadvm_section_part_end(f, mis);
1953             if (ret < 0) {
1954                 goto out;
1955             }
1956             break;
1957         case QEMU_VM_COMMAND:
1958             ret = loadvm_process_command(f);
1959             trace_qemu_loadvm_state_section_command(ret);
1960             if ((ret < 0) || (ret & LOADVM_QUIT)) {
1961                 goto out;
1962             }
1963             break;
1964         default:
1965             error_report("Unknown savevm section type %d", section_type);
1966             ret = -EINVAL;
1967             goto out;
1968         }
1969     }
1970 
1971 out:
1972     if (ret < 0) {
1973         qemu_file_set_error(f, ret);
1974     }
1975     return ret;
1976 }
1977 
1978 int qemu_loadvm_state(QEMUFile *f)
1979 {
1980     MigrationIncomingState *mis = migration_incoming_get_current();
1981     Error *local_err = NULL;
1982     unsigned int v;
1983     int ret;
1984 
1985     if (qemu_savevm_state_blocked(&local_err)) {
1986         error_report_err(local_err);
1987         return -EINVAL;
1988     }
1989 
1990     v = qemu_get_be32(f);
1991     if (v != QEMU_VM_FILE_MAGIC) {
1992         error_report("Not a migration stream");
1993         return -EINVAL;
1994     }
1995 
1996     v = qemu_get_be32(f);
1997     if (v == QEMU_VM_FILE_VERSION_COMPAT) {
1998         error_report("SaveVM v2 format is obsolete and don't work anymore");
1999         return -ENOTSUP;
2000     }
2001     if (v != QEMU_VM_FILE_VERSION) {
2002         error_report("Unsupported migration stream version");
2003         return -ENOTSUP;
2004     }
2005 
2006     if (!savevm_state.skip_configuration || enforce_config_section()) {
2007         if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2008             error_report("Configuration section missing");
2009             return -EINVAL;
2010         }
2011         ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2012 
2013         if (ret) {
2014             return ret;
2015         }
2016     }
2017 
2018     ret = qemu_loadvm_state_main(f, mis);
2019     qemu_event_set(&mis->main_thread_load_event);
2020 
2021     trace_qemu_loadvm_state_post_main(ret);
2022 
2023     if (mis->have_listen_thread) {
2024         /* Listen thread still going, can't clean up yet */
2025         return ret;
2026     }
2027 
2028     if (ret == 0) {
2029         ret = qemu_file_get_error(f);
2030     }
2031 
2032     /*
2033      * Try to read in the VMDESC section as well, so that dumping tools that
2034      * intercept our migration stream have the chance to see it.
2035      */
2036 
2037     /* We've got to be careful; if we don't read the data and just shut the fd
2038      * then the sender can error if we close while it's still sending.
2039      * We also mustn't read data that isn't there; some transports (RDMA)
2040      * will stall waiting for that data when the source has already closed.
2041      */
2042     if (ret == 0 && should_send_vmdesc()) {
2043         uint8_t *buf;
2044         uint32_t size;
2045         uint8_t  section_type = qemu_get_byte(f);
2046 
2047         if (section_type != QEMU_VM_VMDESCRIPTION) {
2048             error_report("Expected vmdescription section, but got %d",
2049                          section_type);
2050             /*
2051              * It doesn't seem worth failing at this point since
2052              * we apparently have an otherwise valid VM state
2053              */
2054         } else {
2055             buf = g_malloc(0x1000);
2056             size = qemu_get_be32(f);
2057 
2058             while (size > 0) {
2059                 uint32_t read_chunk = MIN(size, 0x1000);
2060                 qemu_get_buffer(f, buf, read_chunk);
2061                 size -= read_chunk;
2062             }
2063             g_free(buf);
2064         }
2065     }
2066 
2067     cpu_synchronize_all_post_init();
2068 
2069     return ret;
2070 }
2071 
2072 int save_snapshot(const char *name, Error **errp)
2073 {
2074     BlockDriverState *bs, *bs1;
2075     QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1;
2076     int ret = -1;
2077     QEMUFile *f;
2078     int saved_vm_running;
2079     uint64_t vm_state_size;
2080     qemu_timeval tv;
2081     struct tm tm;
2082     AioContext *aio_context;
2083 
2084     if (!bdrv_all_can_snapshot(&bs)) {
2085         error_setg(errp, "Device '%s' is writable but does not support "
2086                    "snapshots", bdrv_get_device_name(bs));
2087         return ret;
2088     }
2089 
2090     /* Delete old snapshots of the same name */
2091     if (name) {
2092         ret = bdrv_all_delete_snapshot(name, &bs1, errp);
2093         if (ret < 0) {
2094             error_prepend(errp, "Error while deleting snapshot on device "
2095                           "'%s': ", bdrv_get_device_name(bs1));
2096             return ret;
2097         }
2098     }
2099 
2100     bs = bdrv_all_find_vmstate_bs();
2101     if (bs == NULL) {
2102         error_setg(errp, "No block device can accept snapshots");
2103         return ret;
2104     }
2105     aio_context = bdrv_get_aio_context(bs);
2106 
2107     saved_vm_running = runstate_is_running();
2108 
2109     ret = global_state_store();
2110     if (ret) {
2111         error_setg(errp, "Error saving global state");
2112         return ret;
2113     }
2114     vm_stop(RUN_STATE_SAVE_VM);
2115 
2116     aio_context_acquire(aio_context);
2117 
2118     memset(sn, 0, sizeof(*sn));
2119 
2120     /* fill auxiliary fields */
2121     qemu_gettimeofday(&tv);
2122     sn->date_sec = tv.tv_sec;
2123     sn->date_nsec = tv.tv_usec * 1000;
2124     sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2125 
2126     if (name) {
2127         ret = bdrv_snapshot_find(bs, old_sn, name);
2128         if (ret >= 0) {
2129             pstrcpy(sn->name, sizeof(sn->name), old_sn->name);
2130             pstrcpy(sn->id_str, sizeof(sn->id_str), old_sn->id_str);
2131         } else {
2132             pstrcpy(sn->name, sizeof(sn->name), name);
2133         }
2134     } else {
2135         /* cast below needed for OpenBSD where tv_sec is still 'long' */
2136         localtime_r((const time_t *)&tv.tv_sec, &tm);
2137         strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm);
2138     }
2139 
2140     /* save the VM state */
2141     f = qemu_fopen_bdrv(bs, 1);
2142     if (!f) {
2143         error_setg(errp, "Could not open VM state file");
2144         goto the_end;
2145     }
2146     ret = qemu_savevm_state(f, errp);
2147     vm_state_size = qemu_ftell(f);
2148     qemu_fclose(f);
2149     if (ret < 0) {
2150         goto the_end;
2151     }
2152 
2153     ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
2154     if (ret < 0) {
2155         error_setg(errp, "Error while creating snapshot on '%s'",
2156                    bdrv_get_device_name(bs));
2157         goto the_end;
2158     }
2159 
2160     ret = 0;
2161 
2162  the_end:
2163     aio_context_release(aio_context);
2164     if (saved_vm_running) {
2165         vm_start();
2166     }
2167     return ret;
2168 }
2169 
2170 void qmp_xen_save_devices_state(const char *filename, Error **errp)
2171 {
2172     QEMUFile *f;
2173     QIOChannelFile *ioc;
2174     int saved_vm_running;
2175     int ret;
2176 
2177     saved_vm_running = runstate_is_running();
2178     vm_stop(RUN_STATE_SAVE_VM);
2179     global_state_store_running();
2180 
2181     ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp);
2182     if (!ioc) {
2183         goto the_end;
2184     }
2185     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2186     f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2187     ret = qemu_save_device_state(f);
2188     qemu_fclose(f);
2189     if (ret < 0) {
2190         error_setg(errp, QERR_IO_ERROR);
2191     }
2192 
2193  the_end:
2194     if (saved_vm_running) {
2195         vm_start();
2196     }
2197 }
2198 
2199 void qmp_xen_load_devices_state(const char *filename, Error **errp)
2200 {
2201     QEMUFile *f;
2202     QIOChannelFile *ioc;
2203     int ret;
2204 
2205     /* Guest must be paused before loading the device state; the RAM state
2206      * will already have been loaded by xc
2207      */
2208     if (runstate_is_running()) {
2209         error_setg(errp, "Cannot update device state while vm is running");
2210         return;
2211     }
2212     vm_stop(RUN_STATE_RESTORE_VM);
2213 
2214     ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2215     if (!ioc) {
2216         return;
2217     }
2218     qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2219     f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2220 
2221     ret = qemu_loadvm_state(f);
2222     qemu_fclose(f);
2223     if (ret < 0) {
2224         error_setg(errp, QERR_IO_ERROR);
2225     }
2226     migration_incoming_state_destroy();
2227 }
2228 
2229 int load_snapshot(const char *name, Error **errp)
2230 {
2231     BlockDriverState *bs, *bs_vm_state;
2232     QEMUSnapshotInfo sn;
2233     QEMUFile *f;
2234     int ret;
2235     AioContext *aio_context;
2236     MigrationIncomingState *mis = migration_incoming_get_current();
2237 
2238     if (!bdrv_all_can_snapshot(&bs)) {
2239         error_setg(errp,
2240                    "Device '%s' is writable but does not support snapshots",
2241                    bdrv_get_device_name(bs));
2242         return -ENOTSUP;
2243     }
2244     ret = bdrv_all_find_snapshot(name, &bs);
2245     if (ret < 0) {
2246         error_setg(errp,
2247                    "Device '%s' does not have the requested snapshot '%s'",
2248                    bdrv_get_device_name(bs), name);
2249         return ret;
2250     }
2251 
2252     bs_vm_state = bdrv_all_find_vmstate_bs();
2253     if (!bs_vm_state) {
2254         error_setg(errp, "No block device supports snapshots");
2255         return -ENOTSUP;
2256     }
2257     aio_context = bdrv_get_aio_context(bs_vm_state);
2258 
2259     /* Don't even try to load empty VM states */
2260     aio_context_acquire(aio_context);
2261     ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
2262     aio_context_release(aio_context);
2263     if (ret < 0) {
2264         return ret;
2265     } else if (sn.vm_state_size == 0) {
2266         error_setg(errp, "This is a disk-only snapshot. Revert to it "
2267                    " offline using qemu-img");
2268         return -EINVAL;
2269     }
2270 
2271     /* Flush all IO requests so they don't interfere with the new state.  */
2272     bdrv_drain_all();
2273 
2274     ret = bdrv_all_goto_snapshot(name, &bs);
2275     if (ret < 0) {
2276         error_setg(errp, "Error %d while activating snapshot '%s' on '%s'",
2277                      ret, name, bdrv_get_device_name(bs));
2278         return ret;
2279     }
2280 
2281     /* restore the VM state */
2282     f = qemu_fopen_bdrv(bs_vm_state, 0);
2283     if (!f) {
2284         error_setg(errp, "Could not open VM state file");
2285         return -EINVAL;
2286     }
2287 
2288     qemu_system_reset(SHUTDOWN_CAUSE_NONE);
2289     mis->from_src_file = f;
2290 
2291     aio_context_acquire(aio_context);
2292     ret = qemu_loadvm_state(f);
2293     qemu_fclose(f);
2294     aio_context_release(aio_context);
2295 
2296     migration_incoming_state_destroy();
2297     if (ret < 0) {
2298         error_setg(errp, "Error %d while loading VM state", ret);
2299         return ret;
2300     }
2301 
2302     return 0;
2303 }
2304 
2305 void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
2306 {
2307     qemu_ram_set_idstr(mr->ram_block,
2308                        memory_region_name(mr), dev);
2309 }
2310 
2311 void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
2312 {
2313     qemu_ram_unset_idstr(mr->ram_block);
2314 }
2315 
2316 void vmstate_register_ram_global(MemoryRegion *mr)
2317 {
2318     vmstate_register_ram(mr, NULL);
2319 }
2320 
2321 bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
2322 {
2323     /* check needed if --only-migratable is specified */
2324     if (!only_migratable) {
2325         return true;
2326     }
2327 
2328     return !(vmsd && vmsd->unmigratable);
2329 }
2330