/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2009-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "hw/boards.h"
#include "net/net.h"
#include "migration.h"
#include "migration/snapshot.h"
#include "migration/vmstate.h"
#include "migration/misc.h"
#include "migration/register.h"
#include "migration/global_state.h"
#include "migration/channel-block.h"
#include "ram.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/cpus.h"
#include "exec/memory.h"
#include "exec/target_page.h"
#include "trace.h"
#include "qemu/iov.h"
#include "qemu/job.h"
#include "qemu/main-loop.h"
#include "block/snapshot.h"
#include "qemu/cutils.h"
#include "io/channel-buffer.h"
#include "io/channel-file.h"
#include "sysemu/replay.h"
#include "sysemu/runstate.h"
#include "sysemu/sysemu.h"
#include "sysemu/xen.h"
#include "migration/colo.h"
#include "qemu/bitmap.h"
#include "net/announce.h"
#include "qemu/yank.h"
#include "yank_functions.h"
#include "sysemu/qtest.h"
#include "options.h"

const unsigned int postcopy_ram_discard_version;

/* Subcommands for QEMU_VM_COMMAND */
enum qemu_vm_cmd {
    MIG_CMD_INVALID = 0,       /* Must be 0 */
    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
    MIG_CMD_PING,              /* Request a PONG on the RP */

    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
                                      warn we might want to do PC */
    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
                                      pages as it's running. */
    MIG_CMD_POSTCOPY_RUN,          /* Start execution */

    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
                                      were previously sent during
                                      precopy but are dirty. */
    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
    MIG_CMD_RECV_BITMAP,       /* Request for received bitmap on dst */
    MIG_CMD_MAX
};

#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
static struct mig_cmd_args {
    ssize_t len; /* -1 = variable */
    const char *name;
} mig_cmd_args[] = {
    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
};

/* Note for MIG_CMD_POSTCOPY_ADVISE:
 * The format of the arguments depends on the postcopy mode:
 * - postcopy RAM only
 *   uint64_t host page size
 *   uint64_t target page size
 *
 * - postcopy RAM and postcopy dirty bitmaps
 *   format is the same as for postcopy RAM only
 *
 * - postcopy dirty bitmaps only
 *   Nothing. Command length field is 0.
 *
 * Be careful: adding a new postcopy entity with some other parameters should
 * not break format self-description ability. A good way is to introduce some
 * generic extendable format with an exception for the two old entities.
 */

/***********************************************************/
/* savevm/loadvm support */

static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
{
    if (is_writable) {
        return qemu_file_new_output(QIO_CHANNEL(qio_channel_block_new(bs)));
    } else {
        return qemu_file_new_input(QIO_CHANNEL(qio_channel_block_new(bs)));
    }
}


/* QEMUFile timer support.
 * Not in qemu-file.c so as not to add qemu-timer.c as a dependency
 * of qemu-file.c
 */

void timer_put(QEMUFile *f, QEMUTimer *ts)
{
    uint64_t expire_time;

    expire_time = timer_expire_time_ns(ts);
    qemu_put_be64(f, expire_time);
}

void timer_get(QEMUFile *f, QEMUTimer *ts)
{
    uint64_t expire_time;

    expire_time = qemu_get_be64(f);
    if (expire_time != -1) {
        timer_mod_ns(ts, expire_time);
    } else {
        timer_del(ts);
    }
}
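
/*
 * Illustrative sketch (not part of the build): the wire convention above is
 * a single big-endian 64-bit expire time, with (uint64_t)-1 standing for
 * "timer not pending".  A hypothetical device with a QEMUTimer *tick would
 * round-trip it like this:
 *
 *     void my_device_save(QEMUFile *f, MyDeviceState *s)
 *     {
 *         timer_put(f, s->tick);   // writes the be64 expire time, or -1
 *     }
 *
 *     void my_device_load(QEMUFile *f, MyDeviceState *s)
 *     {
 *         timer_get(f, s->tick);   // re-arms via timer_mod_ns(), or calls
 *                                  // timer_del() when it reads -1
 *     }
 */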

/* VMState timer support.
 * Not in vmstate.c so as not to add qemu-timer.c as a dependency
 * of vmstate.c
 */

static int get_timer(QEMUFile *f, void *pv, size_t size,
                     const VMStateField *field)
{
    QEMUTimer *v = pv;
    timer_get(f, v);
    return 0;
}

static int put_timer(QEMUFile *f, void *pv, size_t size,
                     const VMStateField *field, JSONWriter *vmdesc)
{
    QEMUTimer *v = pv;
    timer_put(f, v);

    return 0;
}

const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};
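
/*
 * Illustrative sketch (not part of the build): vmstate_info_timer is what
 * backs the VMSTATE_TIMER_PTR()/VMSTATE_TIMER() helpers in
 * include/migration/vmstate.h, so a hypothetical device normally reaches
 * it indirectly:
 *
 *     static const VMStateDescription vmstate_my_device = {
 *         .name = "my-device",
 *         .version_id = 1,
 *         .minimum_version_id = 1,
 *         .fields = (VMStateField[]) {
 *             VMSTATE_TIMER_PTR(tick, MyDeviceState), // vmstate_info_timer
 *             VMSTATE_END_OF_LIST()
 *         }
 *     };
 */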

typedef struct CompatEntry {
    char idstr[256];
    int instance_id;
} CompatEntry;

typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    char idstr[256];
    uint32_t instance_id;
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    const SaveVMHandlers *ops;
    const VMStateDescription *vmsd;
    void *opaque;
    CompatEntry *compat;
    int is_ram;
} SaveStateEntry;

typedef struct SaveState {
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
    int global_section_id;
    uint32_t len;
    const char *name;
    uint32_t target_page_bits;
    uint32_t caps_count;
    MigrationCapability *capabilities;
    QemuUUID uuid;
} SaveState;

static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
    .global_section_id = 0,
};

static bool should_validate_capability(int capability)
{
    assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
    /* Validate only new capabilities to keep compatibility. */
    switch (capability) {
    case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
        return true;
    default:
        return false;
    }
}

static uint32_t get_validatable_capabilities_count(void)
{
    MigrationState *s = migrate_get_current();
    uint32_t result = 0;
    int i;
    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (should_validate_capability(i) && s->capabilities[i]) {
            result++;
        }
    }
    return result;
}

static int configuration_pre_save(void *opaque)
{
    SaveState *state = opaque;
    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
    MigrationState *s = migrate_get_current();
    int i, j;

    state->len = strlen(current_name);
    state->name = current_name;
    state->target_page_bits = qemu_target_page_bits();

    state->caps_count = get_validatable_capabilities_count();
    state->capabilities = g_renew(MigrationCapability, state->capabilities,
                                  state->caps_count);
    for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (should_validate_capability(i) && s->capabilities[i]) {
            state->capabilities[j++] = i;
        }
    }
    state->uuid = qemu_uuid;

    return 0;
}

static int configuration_post_save(void *opaque)
{
    SaveState *state = opaque;

    g_free(state->capabilities);
    state->capabilities = NULL;
    state->caps_count = 0;
    return 0;
}

static int configuration_pre_load(void *opaque)
{
    SaveState *state = opaque;

    /* If there is no target-page-bits subsection it means the source
     * predates the variable-target-page-bits support and is using the
     * minimum possible value for this CPU.
     */
    state->target_page_bits = qemu_target_page_bits_min();
    return 0;
}
"on" : "off"); 335 ret = false; 336 /* Don't break here to report all failed capabilities */ 337 } 338 } 339 340 g_free(source_caps_bm); 341 return ret; 342 } 343 344 static int configuration_post_load(void *opaque, int version_id) 345 { 346 SaveState *state = opaque; 347 const char *current_name = MACHINE_GET_CLASS(current_machine)->name; 348 int ret = 0; 349 350 if (strncmp(state->name, current_name, state->len) != 0) { 351 error_report("Machine type received is '%.*s' and local is '%s'", 352 (int) state->len, state->name, current_name); 353 ret = -EINVAL; 354 goto out; 355 } 356 357 if (state->target_page_bits != qemu_target_page_bits()) { 358 error_report("Received TARGET_PAGE_BITS is %d but local is %d", 359 state->target_page_bits, qemu_target_page_bits()); 360 ret = -EINVAL; 361 goto out; 362 } 363 364 if (!configuration_validate_capabilities(state)) { 365 ret = -EINVAL; 366 goto out; 367 } 368 369 out: 370 g_free((void *)state->name); 371 state->name = NULL; 372 state->len = 0; 373 g_free(state->capabilities); 374 state->capabilities = NULL; 375 state->caps_count = 0; 376 377 return ret; 378 } 379 380 static int get_capability(QEMUFile *f, void *pv, size_t size, 381 const VMStateField *field) 382 { 383 MigrationCapability *capability = pv; 384 char capability_str[UINT8_MAX + 1]; 385 uint8_t len; 386 int i; 387 388 len = qemu_get_byte(f); 389 qemu_get_buffer(f, (uint8_t *)capability_str, len); 390 capability_str[len] = '\0'; 391 for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { 392 if (!strcmp(MigrationCapability_str(i), capability_str)) { 393 *capability = i; 394 return 0; 395 } 396 } 397 error_report("Received unknown capability %s", capability_str); 398 return -EINVAL; 399 } 400 401 static int put_capability(QEMUFile *f, void *pv, size_t size, 402 const VMStateField *field, JSONWriter *vmdesc) 403 { 404 MigrationCapability *capability = pv; 405 const char *capability_str = MigrationCapability_str(*capability); 406 size_t len = strlen(capability_str); 407 assert(len <= UINT8_MAX); 408 409 qemu_put_byte(f, len); 410 qemu_put_buffer(f, (uint8_t *)capability_str, len); 411 return 0; 412 } 413 414 static const VMStateInfo vmstate_info_capability = { 415 .name = "capability", 416 .get = get_capability, 417 .put = put_capability, 418 }; 419 420 /* The target-page-bits subsection is present only if the 421 * target page size is not the same as the default (ie the 422 * minimum page size for a variable-page-size guest CPU). 423 * If it is present then it contains the actual target page 424 * bits for the machine, and migration will fail if the 425 * two ends don't agree about it. 

/* The target-page-bits subsection is present only if the
 * target page size is not the same as the default (i.e. the
 * minimum page size for a variable-page-size guest CPU).
 * If it is present then it contains the actual target page
 * bits for the machine, and migration will fail if the
 * two ends don't agree about it.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    return qemu_target_page_bits()
        > qemu_target_page_bits_min();
}

static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};

static bool vmstate_capabilites_needed(void *opaque)
{
    return get_validatable_capabilities_count() > 0;
}

static const VMStateDescription vmstate_capabilites = {
    .name = "configuration/capabilities",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_capabilites_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32_V(caps_count, SaveState, 1),
        VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
                                    vmstate_info_capability,
                                    MigrationCapability),
        VMSTATE_END_OF_LIST()
    }
};

static bool vmstate_uuid_needed(void *opaque)
{
    return qemu_uuid_set && migrate_validate_uuid();
}

static int vmstate_uuid_post_load(void *opaque, int version_id)
{
    SaveState *state = opaque;
    char uuid_src[UUID_FMT_LEN + 1];
    char uuid_dst[UUID_FMT_LEN + 1];

    if (!qemu_uuid_set) {
        /*
         * It's only a warning because the user might not know the UUID in
         * some cases, e.g. when loading an old snapshot.
         */
        qemu_uuid_unparse(&state->uuid, uuid_src);
        warn_report("UUID is received %s, but local uuid isn't set",
                     uuid_src);
        return 0;
    }
    if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
        qemu_uuid_unparse(&state->uuid, uuid_src);
        qemu_uuid_unparse(&qemu_uuid, uuid_dst);
        error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
        return -EINVAL;
    }
    return 0;
}

static const VMStateDescription vmstate_uuid = {
    .name = "configuration/uuid",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_uuid_needed,
    .post_load = vmstate_uuid_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .post_save = configuration_post_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription *[]) {
        &vmstate_target_page_bits,
        &vmstate_capabilites,
        &vmstate_uuid,
        NULL
    }
};
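
/*
 * Illustrative sketch (not part of the build): the three subsections above
 * follow the usual VMState idiom - a subsection is only put on the wire
 * when its .needed() callback returns true, so an older destination that
 * has never heard of it simply never sees it.  A hypothetical device
 * subsection takes the same shape:
 *
 *     static bool my_feature_needed(void *opaque)
 *     {
 *         MyDeviceState *s = opaque;
 *         return s->feature_enabled;   // omit while still at the default
 *     }
 *
 *     static const VMStateDescription vmstate_my_feature = {
 *         .name = "my-device/feature",  // must be prefixed with the
 *         .version_id = 1,              // parent's name, see vmstate_check()
 *         .minimum_version_id = 1,
 *         .needed = my_feature_needed,
 *         .fields = (VMStateField[]) {
 *             VMSTATE_UINT32(feature_state, MyDeviceState),
 *             VMSTATE_END_OF_LIST()
 *         }
 *     };
 */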
"true" : "false"); 539 fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size); 540 if (field->vmsd != NULL) { 541 fprintf(out_file, ",\n"); 542 dump_vmstate_vmsd(out_file, field->vmsd, indent, false); 543 } 544 fprintf(out_file, "\n%*s}", indent - 2, ""); 545 } 546 547 static void dump_vmstate_vmss(FILE *out_file, 548 const VMStateDescription **subsection, 549 int indent) 550 { 551 if (*subsection != NULL) { 552 dump_vmstate_vmsd(out_file, *subsection, indent, true); 553 } 554 } 555 556 static void dump_vmstate_vmsd(FILE *out_file, 557 const VMStateDescription *vmsd, int indent, 558 bool is_subsection) 559 { 560 if (is_subsection) { 561 fprintf(out_file, "%*s{\n", indent, ""); 562 } else { 563 fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description"); 564 } 565 indent += 2; 566 fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name); 567 fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "", 568 vmsd->version_id); 569 fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "", 570 vmsd->minimum_version_id); 571 if (vmsd->fields != NULL) { 572 const VMStateField *field = vmsd->fields; 573 bool first; 574 575 fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, ""); 576 first = true; 577 while (field->name != NULL) { 578 if (field->flags & VMS_MUST_EXIST) { 579 /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */ 580 field++; 581 continue; 582 } 583 if (!first) { 584 fprintf(out_file, ",\n"); 585 } 586 dump_vmstate_vmsf(out_file, field, indent + 2); 587 field++; 588 first = false; 589 } 590 assert(field->flags == VMS_END); 591 fprintf(out_file, "\n%*s]", indent, ""); 592 } 593 if (vmsd->subsections != NULL) { 594 const VMStateDescription **subsection = vmsd->subsections; 595 bool first; 596 597 fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, ""); 598 first = true; 599 while (*subsection != NULL) { 600 if (!first) { 601 fprintf(out_file, ",\n"); 602 } 603 dump_vmstate_vmss(out_file, subsection, indent + 2); 604 subsection++; 605 first = false; 606 } 607 fprintf(out_file, "\n%*s]", indent, ""); 608 } 609 fprintf(out_file, "\n%*s}", indent - 2, ""); 610 } 611 612 static void dump_machine_type(FILE *out_file) 613 { 614 MachineClass *mc; 615 616 mc = MACHINE_GET_CLASS(current_machine); 617 618 fprintf(out_file, " \"vmschkmachine\": {\n"); 619 fprintf(out_file, " \"Name\": \"%s\"\n", mc->name); 620 fprintf(out_file, " },\n"); 621 } 622 623 void dump_vmstate_json_to_file(FILE *out_file) 624 { 625 GSList *list, *elt; 626 bool first; 627 628 fprintf(out_file, "{\n"); 629 dump_machine_type(out_file); 630 631 first = true; 632 list = object_class_get_list(TYPE_DEVICE, true); 633 for (elt = list; elt; elt = elt->next) { 634 DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data, 635 TYPE_DEVICE); 636 const char *name; 637 int indent = 2; 638 639 if (!dc->vmsd) { 640 continue; 641 } 642 643 if (!first) { 644 fprintf(out_file, ",\n"); 645 } 646 name = object_class_get_name(OBJECT_CLASS(dc)); 647 fprintf(out_file, "%*s\"%s\": {\n", indent, "", name); 648 indent += 2; 649 fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name); 650 fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "", 651 dc->vmsd->version_id); 652 fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "", 653 dc->vmsd->minimum_version_id); 654 655 dump_vmstate_vmsd(out_file, dc->vmsd, indent, false); 656 657 fprintf(out_file, "\n%*s}", indent - 2, ""); 658 first = false; 659 } 660 fprintf(out_file, "\n}\n"); 661 fclose(out_file); 662 g_slist_free(list); 663 } 664 665 static 

static uint32_t calculate_new_instance_id(const char *idstr)
{
    SaveStateEntry *se;
    uint32_t instance_id = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (strcmp(idstr, se->idstr) == 0
            && instance_id <= se->instance_id) {
            instance_id = se->instance_id + 1;
        }
    }
    /* Make sure we never loop over without being noticed */
    assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
    return instance_id;
}

static int calculate_compat_instance_id(const char *idstr)
{
    SaveStateEntry *se;
    int instance_id = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->compat) {
            continue;
        }

        if (strcmp(idstr, se->compat->idstr) == 0
            && instance_id <= se->compat->instance_id) {
            instance_id = se->compat->instance_id + 1;
        }
    }
    return instance_id;
}

static inline MigrationPriority save_state_priority(SaveStateEntry *se)
{
    if (se->vmsd) {
        return se->vmsd->priority;
    }
    return MIG_PRI_DEFAULT;
}

static void savevm_state_handler_insert(SaveStateEntry *nse)
{
    MigrationPriority priority = save_state_priority(nse);
    SaveStateEntry *se;
    int i;

    assert(priority <= MIG_PRI_MAX);

    for (i = priority - 1; i >= 0; i--) {
        se = savevm_state.handler_pri_head[i];
        if (se != NULL) {
            assert(save_state_priority(se) < priority);
            break;
        }
    }

    if (i >= 0) {
        QTAILQ_INSERT_BEFORE(se, nse, entry);
    } else {
        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
    }

    if (savevm_state.handler_pri_head[priority] == NULL) {
        savevm_state.handler_pri_head[priority] = nse;
    }
}

static void savevm_state_handler_remove(SaveStateEntry *se)
{
    SaveStateEntry *next;
    MigrationPriority priority = save_state_priority(se);

    if (se == savevm_state.handler_pri_head[priority]) {
        next = QTAILQ_NEXT(se, entry);
        if (next != NULL && save_state_priority(next) == priority) {
            savevm_state.handler_pri_head[priority] = next;
        } else {
            savevm_state.handler_pri_head[priority] = NULL;
        }
    }
    QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
}

/* TODO: Individual devices generally have very little idea about the rest
   of the system, so instance_id should be removed/replaced.
   Meanwhile pass -1 as instance_id if you do not already have a clearly
   distinguishing id for all instances of your device class. */
int register_savevm_live(const char *idstr,
                         uint32_t instance_id,
                         int version_id,
                         const SaveVMHandlers *ops,
                         void *opaque)
{
    SaveStateEntry *se;

    se = g_new0(SaveStateEntry, 1);
    se->version_id = version_id;
    se->section_id = savevm_state.global_section_id++;
    se->ops = ops;
    se->opaque = opaque;
    se->vmsd = NULL;
    /* if this is a live_savevm handler then set is_ram */
    if (ops->save_setup != NULL) {
        se->is_ram = 1;
    }

    pstrcat(se->idstr, sizeof(se->idstr), idstr);

    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
        se->instance_id = calculate_new_instance_id(se->idstr);
    } else {
        se->instance_id = instance_id;
    }
    assert(!se->compat || se->instance_id == 0);
    savevm_state_handler_insert(se);
    return 0;
}

void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
{
    SaveStateEntry *se, *new_se;
    char id[256] = "";

    if (obj) {
        char *oid = vmstate_if_get_id(obj);
        if (oid) {
            pstrcpy(id, sizeof(id), oid);
            pstrcat(id, sizeof(id), "/");
            g_free(oid);
        }
    }
    pstrcat(id, sizeof(id), idstr);

    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
            savevm_state_handler_remove(se);
            g_free(se->compat);
            g_free(se);
        }
    }
}

/*
 * Perform some basic checks on vmsd's at registration
 * time.
 */
static void vmstate_check(const VMStateDescription *vmsd)
{
    const VMStateField *field = vmsd->fields;
    const VMStateDescription **subsection = vmsd->subsections;

    if (field) {
        while (field->name) {
            if (field->flags & (VMS_STRUCT | VMS_VSTRUCT)) {
                /* Recurse to sub structures */
                vmstate_check(field->vmsd);
            }
            /* Carry on */
            field++;
        }
        /* Check for the end of field list canary */
        if (field->flags != VMS_END) {
            error_report("VMSTATE not ending with VMS_END: %s", vmsd->name);
            g_assert_not_reached();
        }
    }

    while (subsection && *subsection) {
        /*
         * The name of a subsection should start with the name of the
         * current object.
         */
        assert(!strncmp(vmsd->name, (*subsection)->name, strlen(vmsd->name)));
        vmstate_check(*subsection);
        subsection++;
    }
}
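
/*
 * Illustrative sketch (not part of the build): a legacy-style handler for a
 * hypothetical device pairs register_savevm_live() above with a
 * SaveVMHandlers that fills in at least save_state/load_state:
 *
 *     static void my_save_state(QEMUFile *f, void *opaque)
 *     {
 *         MyDeviceState *s = opaque;
 *         qemu_put_be32(f, s->level);
 *     }
 *
 *     static int my_load_state(QEMUFile *f, void *opaque, int version_id)
 *     {
 *         MyDeviceState *s = opaque;
 *         s->level = qemu_get_be32(f);
 *         return 0;
 *     }
 *
 *     static SaveVMHandlers my_handlers = {
 *         .save_state = my_save_state,
 *         .load_state = my_load_state,
 *     };
 *
 *     register_savevm_live("my-device", VMSTATE_INSTANCE_ID_ANY, 1,
 *                          &my_handlers, &my_device_state);
 *
 * Handlers that also provide save_setup are treated as "is_ram" live
 * handlers, per the check in register_savevm_live().
 */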

int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
                                   const VMStateDescription *vmsd,
                                   void *opaque, int alias_id,
                                   int required_for_version,
                                   Error **errp)
{
    SaveStateEntry *se;

    /* If this triggers, alias support can be dropped for the vmsd. */
    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);

    se = g_new0(SaveStateEntry, 1);
    se->version_id = vmsd->version_id;
    se->section_id = savevm_state.global_section_id++;
    se->opaque = opaque;
    se->vmsd = vmsd;
    se->alias_id = alias_id;

    if (obj) {
        char *id = vmstate_if_get_id(obj);
        if (id) {
            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
                sizeof(se->idstr)) {
                error_setg(errp, "Path too long for VMState (%s)", id);
                g_free(id);
                g_free(se);

                return -1;
            }
            g_free(id);

            se->compat = g_new0(CompatEntry, 1);
            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
            se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
                calculate_compat_instance_id(vmsd->name) : instance_id;
            instance_id = VMSTATE_INSTANCE_ID_ANY;
        }
    }
    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);

    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
        se->instance_id = calculate_new_instance_id(se->idstr);
    } else {
        se->instance_id = instance_id;
    }

    /* Perform a recursive sanity check during the test runs */
    if (qtest_enabled()) {
        vmstate_check(vmsd);
    }
    assert(!se->compat || se->instance_id == 0);
    savevm_state_handler_insert(se);
    return 0;
}

void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
                        void *opaque)
{
    SaveStateEntry *se, *new_se;

    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
        if (se->vmsd == vmsd && se->opaque == opaque) {
            savevm_state_handler_remove(se);
            g_free(se->compat);
            g_free(se);
        }
    }
}

static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
{
    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
    if (!se->vmsd) {         /* Old style */
        return se->ops->load_state(f, se->opaque, se->load_version_id);
    }
    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
}

static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
                                   JSONWriter *vmdesc)
{
    int64_t old_offset, size;

    old_offset = qemu_file_total_transferred_fast(f);
    se->ops->save_state(f, se->opaque);
    size = qemu_file_total_transferred_fast(f) - old_offset;

    if (vmdesc) {
        json_writer_int64(vmdesc, "size", size);
        json_writer_start_array(vmdesc, "fields");
        json_writer_start_object(vmdesc, NULL);
        json_writer_str(vmdesc, "name", "data");
        json_writer_int64(vmdesc, "size", size);
        json_writer_str(vmdesc, "type", "buffer");
        json_writer_end_object(vmdesc);
        json_writer_end_array(vmdesc);
    }
}

/*
 * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
 */
static void save_section_header(QEMUFile *f, SaveStateEntry *se,
                                uint8_t section_type)
{
    qemu_put_byte(f, section_type);
    qemu_put_be32(f, se->section_id);

    if (section_type == QEMU_VM_SECTION_FULL ||
        section_type == QEMU_VM_SECTION_START) {
        /* ID string */
        size_t len = strlen(se->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)se->idstr, len);

        qemu_put_be32(f, se->instance_id);
        qemu_put_be32(f, se->version_id);
    }
}

/*
 * Write a footer onto device sections that catches misformatted device
 * sections.
 */
static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
{
    if (migrate_get_current()->send_section_footer) {
        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
        qemu_put_be32(f, se->section_id);
    }
}
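
/*
 * Illustrative sketch (not part of the build): putting the two helpers
 * above together, a device section is framed on the wire as:
 *
 *     byte   section type (FULL / START / PART / END)
 *     be32   section id
 *     byte   idstr length          \
 *     n      idstr bytes            | FULL and START only
 *     be32   instance id            |
 *     be32   version id            /
 *     ...    section payload ...
 *     byte   QEMU_VM_SECTION_FOOTER \  only when send_section_footer
 *     be32   section id             /  is negotiated
 */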

static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc)
{
    int ret;

    if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
        return 0;
    }
    if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
        trace_savevm_section_skip(se->idstr, se->section_id);
        return 0;
    }

    trace_savevm_section_start(se->idstr, se->section_id);
    save_section_header(f, se, QEMU_VM_SECTION_FULL);
    if (vmdesc) {
        json_writer_start_object(vmdesc, NULL);
        json_writer_str(vmdesc, "name", se->idstr);
        json_writer_int64(vmdesc, "instance_id", se->instance_id);
    }

    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
    if (!se->vmsd) {
        vmstate_save_old_style(f, se, vmdesc);
    } else {
        ret = vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
        if (ret) {
            return ret;
        }
    }

    trace_savevm_section_end(se->idstr, se->section_id, 0);
    save_section_footer(f, se);
    if (vmdesc) {
        json_writer_end_object(vmdesc);
    }
    return 0;
}

/**
 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 *                           command and associated data.
 *
 * @f: File to send command on
 * @command: Command type to send
 * @len: Length of associated data
 * @data: Data associated with command.
 */
static void qemu_savevm_command_send(QEMUFile *f,
                                     enum qemu_vm_cmd command,
                                     uint16_t len,
                                     uint8_t *data)
{
    trace_savevm_command_send(command, len);
    qemu_put_byte(f, QEMU_VM_COMMAND);
    qemu_put_be16(f, (uint16_t)command);
    qemu_put_be16(f, len);
    qemu_put_buffer(f, data, len);
    qemu_fflush(f);
}

void qemu_savevm_send_colo_enable(QEMUFile *f)
{
    trace_savevm_send_colo_enable();
    qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
}

void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
{
    uint32_t buf;

    trace_savevm_send_ping(value);
    buf = cpu_to_be32(value);
    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
}

void qemu_savevm_send_open_return_path(QEMUFile *f)
{
    trace_savevm_send_open_return_path();
    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
}

/* We have a buffer of data to send; we don't want that all to be loaded
 * by the command itself, so the command contains just the length of the
 * extra buffer that we then send straight after it.
 * TODO: Must be a better way to organise that
 *
 * Returns:
 *    0 on success
 *    -ve on error
 */
int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
{
    uint32_t tmp;

    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
        error_report("%s: Unreasonably large packaged state: %zu",
                     __func__, len);
        return -1;
    }

    tmp = cpu_to_be32(len);

    trace_qemu_savevm_send_packaged();
    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);

    qemu_put_buffer(f, buf, len);

    return 0;
}

/* Send prior to any postcopy transfer */
void qemu_savevm_send_postcopy_advise(QEMUFile *f)
{
    if (migrate_postcopy_ram()) {
        uint64_t tmp[2];
        tmp[0] = cpu_to_be64(ram_pagesize_summary());
        tmp[1] = cpu_to_be64(qemu_target_page_size());

        trace_qemu_savevm_send_postcopy_advise();
        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
                                 16, (uint8_t *)tmp);
    } else {
        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
    }
}
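
/*
 * Illustrative sketch (not part of the build): combined with
 * qemu_savevm_command_send() above, a postcopy-RAM advise is laid out on
 * the wire as:
 *
 *     byte   QEMU_VM_COMMAND
 *     be16   MIG_CMD_POSTCOPY_ADVISE
 *     be16   16                          (payload length)
 *     be64   ram_pagesize_summary()
 *     be64   qemu_target_page_size()
 *
 * In the dirty-bitmaps-only mode the same command goes out with a zero
 * length and no payload, which is how the destination tells the two cases
 * apart (see the MIG_CMD_POSTCOPY_ADVISE note near the top of this file).
 */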

/* Sent prior to starting the destination running in postcopy, discard pages
 * that have already been sent but were redirtied on the source.
 * CMD_POSTCOPY_RAM_DISCARD consists of:
 *      byte   version (0)
 *      byte   Length of name field (not including 0)
 *  n x byte   RAM block name
 *      byte   0 terminator (just for safety)
 *  n x        Byte ranges within the named RAMBlock
 *      be64   Start of the range
 *      be64   Length
 *
 *  name:  RAMBlock name that these entries are part of
 *  len: Number of page entries
 *  start_list: 'len' addresses
 *  length_list: 'len' addresses
 *
 */
void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
                                           uint16_t len,
                                           uint64_t *start_list,
                                           uint64_t *length_list)
{
    uint8_t *buf;
    uint16_t tmplen;
    uint16_t t;
    size_t name_len = strlen(name);

    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
    assert(name_len < 256);
    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
    buf[0] = postcopy_ram_discard_version;
    buf[1] = name_len;
    memcpy(buf + 2, name, name_len);
    tmplen = 2 + name_len;
    buf[tmplen++] = '\0';

    for (t = 0; t < len; t++) {
        stq_be_p(buf + tmplen, start_list[t]);
        tmplen += 8;
        stq_be_p(buf + tmplen, length_list[t]);
        tmplen += 8;
    }
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
    g_free(buf);
}

/* Get the destination into a state where it can receive postcopy data. */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}

/* Kick the destination into running */
void qemu_savevm_send_postcopy_run(QEMUFile *f)
{
    trace_savevm_send_postcopy_run();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
}

void qemu_savevm_send_postcopy_resume(QEMUFile *f)
{
    trace_savevm_send_postcopy_resume();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
}

void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
{
    size_t len;
    char buf[256];

    trace_savevm_send_recv_bitmap(block_name);

    buf[0] = len = strlen(block_name);
    memcpy(buf + 1, block_name, len);

    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
}

bool qemu_savevm_state_blocked(Error **errp)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->unmigratable) {
            error_setg(errp, "State blocked by non-migratable device '%s'",
                       se->idstr);
            return true;
        }
    }
    return false;
}

void qemu_savevm_non_migratable_list(strList **reasons)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->unmigratable) {
            QAPI_LIST_PREPEND(*reasons,
                              g_strdup_printf("non-migratable device: %s",
                                              se->idstr));
        }
    }
}

void qemu_savevm_state_header(QEMUFile *f)
{
    trace_savevm_state_header();
    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
    qemu_put_be32(f, QEMU_VM_FILE_VERSION);

    if (migrate_get_current()->send_configuration) {
        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
    }
}
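
/*
 * Illustrative sketch (not part of the build): the stream produced above
 * therefore begins with the magic/version pair,
 *
 *     be32   QEMU_VM_FILE_MAGIC       (0x5145564d, ASCII "QEVM")
 *     be32   QEMU_VM_FILE_VERSION
 *
 * optionally followed by a QEMU_VM_CONFIGURATION byte and the
 * vmstate_configuration section defined earlier in this file.
 */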

bool qemu_savevm_state_guest_unplug_pending(void)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->dev_unplug_pending &&
            se->vmsd->dev_unplug_pending(se->opaque)) {
            return true;
        }
    }

    return false;
}

void qemu_savevm_state_setup(QEMUFile *f)
{
    MigrationState *ms = migrate_get_current();
    SaveStateEntry *se;
    Error *local_err = NULL;
    int ret;

    ms->vmdesc = json_writer_new(false);
    json_writer_start_object(ms->vmdesc, NULL);
    json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
    json_writer_start_array(ms->vmdesc, "devices");

    trace_savevm_state_setup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->early_setup) {
            ret = vmstate_save(f, se, ms->vmdesc);
            if (ret) {
                qemu_file_set_error(f, ret);
                break;
            }
            continue;
        }

        if (!se->ops || !se->ops->save_setup) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        save_section_header(f, se, QEMU_VM_SECTION_START);

        ret = se->ops->save_setup(f, se->opaque);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            break;
        }
    }

    if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
        error_report_err(local_err);
    }
}

int qemu_savevm_state_resume_prepare(MigrationState *s)
{
    SaveStateEntry *se;
    int ret;

    trace_savevm_state_resume_prepare();

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->resume_prepare) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        ret = se->ops->resume_prepare(s, se->opaque);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

/*
 * this function has three return values:
 *   negative: there was an error, and we have -errno.
 *   0 : We haven't finished, the caller has to go again
 *   1 : We have finished, we can go to complete phase
 */
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
{
    SaveStateEntry *se;
    int ret = 1;

    trace_savevm_state_iterate();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_live_iterate) {
            continue;
        }
        if (se->ops->is_active &&
            !se->ops->is_active(se->opaque)) {
            continue;
        }
        if (se->ops->is_active_iterate &&
            !se->ops->is_active_iterate(se->opaque)) {
            continue;
        }
        /*
         * In the postcopy phase, any device that doesn't know how to
         * do postcopy should have saved its state in the _complete
         * call that's already run, it might get confused if we call
         * iterate afterwards.
         */
        if (postcopy &&
            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
            continue;
        }
        if (qemu_file_rate_limit(f)) {
            return 0;
        }
        trace_savevm_section_start(se->idstr, se->section_id);

        save_section_header(f, se, QEMU_VM_SECTION_PART);

        ret = se->ops->save_live_iterate(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);

        if (ret < 0) {
            error_report("failed to save SaveStateEntry with id(name): "
                         "%d(%s): %d",
                         se->section_id, se->idstr, ret);
            qemu_file_set_error(f, ret);
        }
        if (ret <= 0) {
            /* Do not proceed to the next vmstate before this one reported
               completion of the current stage. This serializes the migration
               and reduces the probability that a faster changing state is
               synchronized over and over again. */
            break;
        }
    }
    return ret;
}

static bool should_send_vmdesc(void)
{
    MachineState *machine = MACHINE(qdev_get_machine());
    bool in_postcopy = migration_in_postcopy();
    return !machine->suppress_vmdesc && !in_postcopy;
}

/*
 * Calls the save_live_complete_postcopy methods
 * causing the last few pages to be sent immediately and doing any associated
 * cleanup.
 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
 * all the other devices, but that happens at the point we switch to postcopy.
 */
void qemu_savevm_state_complete_postcopy(QEMUFile *f)
{
    SaveStateEntry *se;
    int ret;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_live_complete_postcopy) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        trace_savevm_section_start(se->idstr, se->section_id);
        /* Section type */
        qemu_put_byte(f, QEMU_VM_SECTION_END);
        qemu_put_be32(f, se->section_id);

        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            return;
        }
    }

    qemu_put_byte(f, QEMU_VM_EOF);
    qemu_fflush(f);
}

static
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
{
    SaveStateEntry *se;
    int ret;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops ||
            (in_postcopy && se->ops->has_postcopy &&
             se->ops->has_postcopy(se->opaque)) ||
            !se->ops->save_live_complete_precopy) {
            continue;
        }

        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        trace_savevm_section_start(se->idstr, se->section_id);

        save_section_header(f, se, QEMU_VM_SECTION_END);

        ret = se->ops->save_live_complete_precopy(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            return -1;
        }
    }

    return 0;
}
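
/*
 * Illustrative sketch (not part of the build): callers drive
 * qemu_savevm_state_iterate() above as a pump, exactly as
 * qemu_savevm_state() does further down in this file:
 *
 *     while (qemu_file_get_error(f) == 0) {
 *         if (qemu_savevm_state_iterate(f, false) > 0) {
 *             break;    // every iterating device reported completion
 *         }
 *     }
 *     // ... then qemu_savevm_state_complete_precopy() for the final pass
 */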

int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
                                                    bool in_postcopy,
                                                    bool inactivate_disks)
{
    MigrationState *ms = migrate_get_current();
    JSONWriter *vmdesc = ms->vmdesc;
    int vmdesc_len;
    SaveStateEntry *se;
    int ret;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->early_setup) {
            /* Already saved during qemu_savevm_state_setup(). */
            continue;
        }

        ret = vmstate_save(f, se, vmdesc);
        if (ret) {
            qemu_file_set_error(f, ret);
            return ret;
        }
    }

    if (inactivate_disks) {
        /* Inactivate before sending QEMU_VM_EOF so that the
         * bdrv_activate_all() on the other end won't fail. */
        ret = bdrv_inactivate_all();
        if (ret) {
            error_report("%s: bdrv_inactivate_all() failed (%d)",
                         __func__, ret);
            qemu_file_set_error(f, ret);
            return ret;
        }
    }
    if (!in_postcopy) {
        /* Postcopy stream will still be going */
        qemu_put_byte(f, QEMU_VM_EOF);
    }

    json_writer_end_array(vmdesc);
    json_writer_end_object(vmdesc);
    vmdesc_len = strlen(json_writer_get(vmdesc));

    if (should_send_vmdesc()) {
        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
        qemu_put_be32(f, vmdesc_len);
        qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
    }

    /* Free it now to detect any inconsistencies. */
    json_writer_free(vmdesc);
    ms->vmdesc = NULL;

    return 0;
}

int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
                                       bool inactivate_disks)
{
    int ret;
    Error *local_err = NULL;
    bool in_postcopy = migration_in_postcopy();

    if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
        error_report_err(local_err);
    }

    trace_savevm_state_complete_precopy();

    cpu_synchronize_all_states();

    if (!in_postcopy || iterable_only) {
        ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
        if (ret) {
            return ret;
        }
    }

    if (iterable_only) {
        goto flush;
    }

    ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
                                                          inactivate_disks);
    if (ret) {
        return ret;
    }

flush:
    qemu_fflush(f);
    return 0;
}
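
/*
 * Illustrative sketch (not part of the build): when should_send_vmdesc()
 * allows it, the JSON device description built up in ms->vmdesc trails the
 * main stream's QEMU_VM_EOF as:
 *
 *     byte   QEMU_VM_VMDESCRIPTION
 *     be32   length of the JSON text
 *     n      JSON bytes (not NUL-terminated)
 *
 * It is informational; a destination that doesn't care can skip it.
 */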

/* Give an estimate of the amount left to be transferred; the result is split
 * into the amount for units that can and for units that can't do postcopy.
 */
void qemu_savevm_state_pending_estimate(uint64_t *must_precopy,
                                        uint64_t *can_postcopy)
{
    SaveStateEntry *se;

    *must_precopy = 0;
    *can_postcopy = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->state_pending_estimate) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        se->ops->state_pending_estimate(se->opaque, must_precopy, can_postcopy);
    }
}

void qemu_savevm_state_pending_exact(uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    SaveStateEntry *se;

    *must_precopy = 0;
    *can_postcopy = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->state_pending_exact) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        se->ops->state_pending_exact(se->opaque, must_precopy, can_postcopy);
    }
}

void qemu_savevm_state_cleanup(void)
{
    SaveStateEntry *se;
    Error *local_err = NULL;

    if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
        error_report_err(local_err);
    }

    trace_savevm_state_cleanup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->ops && se->ops->save_cleanup) {
            se->ops->save_cleanup(se->opaque);
        }
    }
}

static int qemu_savevm_state(QEMUFile *f, Error **errp)
{
    int ret;
    MigrationState *ms = migrate_get_current();
    MigrationStatus status;

    if (migration_is_running(ms->state)) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return -EINVAL;
    }

    if (migrate_block()) {
        error_setg(errp, "Block migration and snapshots are incompatible");
        return -EINVAL;
    }

    migrate_init(ms);
    memset(&ram_counters, 0, sizeof(ram_counters));
    memset(&compression_counters, 0, sizeof(compression_counters));
    ms->to_dst_file = f;

    qemu_mutex_unlock_iothread();
    qemu_savevm_state_header(f);
    qemu_savevm_state_setup(f);
    qemu_mutex_lock_iothread();

    while (qemu_file_get_error(f) == 0) {
        if (qemu_savevm_state_iterate(f, false) > 0) {
            break;
        }
    }

    ret = qemu_file_get_error(f);
    if (ret == 0) {
        qemu_savevm_state_complete_precopy(f, false, false);
        ret = qemu_file_get_error(f);
    }
    qemu_savevm_state_cleanup();
    if (ret != 0) {
        error_setg_errno(errp, -ret, "Error while writing VM state");
    }

    if (ret != 0) {
        status = MIGRATION_STATUS_FAILED;
    } else {
        status = MIGRATION_STATUS_COMPLETED;
    }
    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);

    /* f is outer parameter, it should not stay in global migration state after
     * this function finished */
    ms->to_dst_file = NULL;

    return ret;
}

void qemu_savevm_live_state(QEMUFile *f)
{
    /* save QEMU_VM_SECTION_END section */
    qemu_savevm_state_complete_precopy(f, true, false);
    qemu_put_byte(f, QEMU_VM_EOF);
}

int qemu_save_device_state(QEMUFile *f)
{
    SaveStateEntry *se;

    if (!migration_in_colo_state()) {
        qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
        qemu_put_be32(f, QEMU_VM_FILE_VERSION);
    }
    cpu_synchronize_all_states();

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        int ret;

        if (se->is_ram) {
            continue;
        }
        ret = vmstate_save(f, se, NULL);
        if (ret) {
            return ret;
        }
    }

    qemu_put_byte(f, QEMU_VM_EOF);

    return qemu_file_get_error(f);
}

static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!strcmp(se->idstr, idstr) &&
            (instance_id == se->instance_id ||
             instance_id == se->alias_id))
            return se;
        /* Migrating from an older version? */
        if (strstr(se->idstr, idstr) && se->compat) {
            if (!strcmp(se->compat->idstr, idstr) &&
                (instance_id == se->compat->instance_id ||
                 instance_id == se->alias_id))
                return se;
        }
    }
    return NULL;
}

enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT     =  1,
};

/* ------ incoming postcopy messages ------ */
/* 'advise' arrives before any transfers just to tell us that a postcopy
 * *might* happen - it might be skipped if precopy transferred everything
 * quickly.
 */
static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
                                         uint16_t len)
{
    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
    size_t page_size = qemu_target_page_size();
    Error *local_err = NULL;

    trace_loadvm_postcopy_handle_advise();
    if (ps != POSTCOPY_INCOMING_NONE) {
        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
        return -1;
    }

    switch (len) {
    case 0:
        if (migrate_postcopy_ram()) {
            error_report("RAM postcopy is enabled but have 0 byte advise");
            return -EINVAL;
        }
        return 0;
    case 8 + 8:
        if (!migrate_postcopy_ram()) {
            error_report("RAM postcopy is disabled but have 16 byte advise");
            return -EINVAL;
        }
        break;
    default:
        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
        return -EINVAL;
    }

    if (!postcopy_ram_supported_by_host(mis)) {
        postcopy_state_set(POSTCOPY_INCOMING_NONE);
        return -1;
    }

    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
    local_pagesize_summary = ram_pagesize_summary();

    if (remote_pagesize_summary != local_pagesize_summary) {
        /*
         * This detects two potential causes of mismatch:
         *   a) A mismatch in host page sizes
         *      Some combinations of mismatch are probably possible but it
         *      gets a bit more complicated. In particular we need to place
         *      whole host pages on the dest at once, and we need to ensure
         *      that we handle dirtying to make sure we never end up sending
         *      part of a hostpage on its own.
         *   b) The use of different huge page sizes on source/destination
         *      a more fine-grained test is performed during RAM block
         *      migration but this test here causes a nice early clear
         *      failure, and also fails when passed to an older qemu that
         *      doesn't do huge pages.
         */
        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
                     " d=%" PRIx64 ")",
                     remote_pagesize_summary, local_pagesize_summary);
        return -1;
    }

    remote_tps = qemu_get_be64(mis->from_src_file);
    if (remote_tps != page_size) {
        /*
         * Again, some differences could be dealt with, but for now keep it
         * simple.
         */
        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
                     (int)remote_tps, page_size);
        return -1;
    }

    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
        error_report_err(local_err);
        return -1;
    }

    if (ram_postcopy_incoming_init(mis)) {
        return -1;
    }

    return 0;
}

/* After postcopy we will be told to throw some pages away since they're
 * dirty and will have to be demand fetched. Must happen before CPU is
 * started.
 * There can be 0..many of these messages, each encoding multiple pages.
 */
static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
                                              uint16_t len)
{
    int tmp;
    char ramid[256];
    PostcopyState ps = postcopy_state_get();

    trace_loadvm_postcopy_ram_handle_discard();

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /* 1st discard */
        tmp = postcopy_ram_prepare_discard(mis);
        if (tmp) {
            return tmp;
        }
        break;

    case POSTCOPY_INCOMING_DISCARD:
        /* Expected state */
        break;

    default:
        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
                     ps);
        return -1;
    }
    /* We're expecting a
     *    Version (0)
     *    a RAM ID string (length byte, name, 0 term)
     *    then at least 1 16 byte chunk
     */
    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
        return -1;
    }

    tmp = qemu_get_byte(mis->from_src_file);
    if (tmp != postcopy_ram_discard_version) {
        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
        return -1;
    }

    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
        return -1;
    }
    tmp = qemu_get_byte(mis->from_src_file);
    if (tmp != 0) {
        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
        return -1;
    }

    len -= 3 + strlen(ramid);
    if (len % 16) {
        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
        return -1;
    }
    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
    while (len) {
        uint64_t start_addr, block_length;
        start_addr = qemu_get_be64(mis->from_src_file);
        block_length = qemu_get_be64(mis->from_src_file);

        len -= 16;
        int ret = ram_discard_range(ramid, start_addr, block_length);
        if (ret) {
            return ret;
        }
    }
    trace_loadvm_postcopy_ram_handle_discard_end();

    return 0;
}

/*
 * Triggered by a postcopy_listen command; this thread takes over reading
 * the input stream, leaving the main thread free to carry on loading the
 * rest of the device state (from RAM).
 * (TODO: This could do with being in a postcopy file - but there again it's
 * just another input loop, not that postcopy specific)
 */
static void *postcopy_ram_listen_thread(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    QEMUFile *f = mis->from_src_file;
    int load_res;
    MigrationState *migr = migrate_get_current();

    object_ref(OBJECT(migr));

    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
    qemu_sem_post(&mis->thread_sync_sem);
    trace_postcopy_ram_listen_thread_start();

    rcu_register_thread();
    /*
     * Because we're a thread and not a coroutine we can't yield
     * in qemu_file, and thus we must be blocking now.
     */
    qemu_file_set_blocking(f, true);
    load_res = qemu_loadvm_state_main(f, mis);

    /*
     * This is tricky, but, mis->from_src_file can change after it
     * returns, when postcopy recovery happened. In the future, we may
     * want a wrapper for the QEMUFile handle.
     */
    f = mis->from_src_file;

    /* And non-blocking again so we don't block in any cleanup */
    qemu_file_set_blocking(f, false);

    trace_postcopy_ram_listen_thread_exit();
    if (load_res < 0) {
        qemu_file_set_error(f, load_res);
        dirty_bitmap_mig_cancel_incoming();
        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
        {
            error_report("%s: loadvm failed during postcopy: %d. All states "
                         "are migrated except dirty bitmaps. Some dirty "
                         "bitmaps may be lost, and present migrated dirty "
                         "bitmaps are correctly migrated and valid.",
                         __func__, load_res);
            load_res = 0; /* prevent further exit() */
        } else {
            error_report("%s: loadvm failed: %d", __func__, load_res);
            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                           MIGRATION_STATUS_FAILED);
        }
    }
    if (load_res >= 0) {
        /*
         * This looks good, but it's possible that the device loading in the
         * main thread hasn't finished yet, and so we might not be in 'RUN'
         * state yet; wait for the end of the main thread.
         */
        qemu_event_wait(&mis->main_thread_load_event);
    }
    postcopy_ram_incoming_cleanup(mis);

    if (load_res < 0) {
        /*
         * If something went wrong then we have a bad state so exit;
         * depending how far we got it might be possible at this point
         * to leave the guest running and fire MCEs for pages that never
         * arrived as a desperate recovery step.
         */
        rcu_unregister_thread();
        exit(EXIT_FAILURE);
    }

    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                   MIGRATION_STATUS_COMPLETED);
    /*
     * If everything has worked fine, then the main thread has waited
     * for us to start, and we're the last use of the mis.
     * (If something broke then qemu will have to exit anyway since it's
     * got a bad migration state).
     */
    migration_incoming_state_destroy();
    qemu_loadvm_state_cleanup();

    rcu_unregister_thread();
    mis->have_listen_thread = false;
    postcopy_state_set(POSTCOPY_INCOMING_END);

    object_unref(OBJECT(migr));

    return NULL;
}

/* After this message we must be able to immediately receive postcopy data */
static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
{
    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
    Error *local_err = NULL;

    trace_loadvm_postcopy_handle_listen("enter");

    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
        return -1;
    }
    if (ps == POSTCOPY_INCOMING_ADVISE) {
        /*
         * A rare case, we entered listen without having to do any discards,
         * so do the setup that's normally done at the time of the 1st discard.
         */
        if (migrate_postcopy_ram()) {
            postcopy_ram_prepare_discard(mis);
        }
    }

    trace_loadvm_postcopy_handle_listen("after discard");

    /*
     * Sensitise RAM - can now generate requests for blocks that don't exist.
     * However, at this point the CPU shouldn't be running, and the IO
     * shouldn't be doing anything yet so don't actually expect requests.
     */
    if (migrate_postcopy_ram()) {
        if (postcopy_ram_incoming_setup(mis)) {
            postcopy_ram_incoming_cleanup(mis);
            return -1;
        }
    }

    trace_loadvm_postcopy_handle_listen("after uffd");

    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
        error_report_err(local_err);
        return -1;
    }

    mis->have_listen_thread = true;
    postcopy_thread_create(mis, &mis->listen_thread, "postcopy/listen",
                           postcopy_ram_listen_thread, QEMU_THREAD_DETACHED);
    trace_loadvm_postcopy_handle_listen("return");

    return 0;
}
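
/*
 * Bottom half scheduled by loadvm_postcopy_handle_run() below; it runs in
 * the main loop thread once the RUN command has been processed and is
 * responsible for actually starting the freshly migrated guest.
 */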
static void loadvm_postcopy_handle_run_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    trace_loadvm_postcopy_handle_run_bh("enter");

    /*
     * TODO: we should move all of this lot into postcopy_ram.c or shared
     * code in migration.c
     */
    cpu_synchronize_all_post_init();

    trace_loadvm_postcopy_handle_run_bh("after cpu sync");

    qemu_announce_self(&mis->announce_timer, migrate_announce_params());

    trace_loadvm_postcopy_handle_run_bh("after announce");

    /*
     * Make sure all file formats throw away their mutable metadata.
     * If we get an error here, just don't restart the VM yet.
     */
    bdrv_activate_all(&local_err);
    if (local_err) {
        error_report_err(local_err);
        local_err = NULL;
        autostart = false;
    }

    trace_loadvm_postcopy_handle_run_bh("after invalidate cache");

    dirty_bitmap_mig_before_vm_start();

    if (autostart) {
        /* Hold onto your hats, starting the CPU */
        vm_start();
    } else {
        /* Leave it paused and let management decide when to start the CPU */
        runstate_set(RUN_STATE_PAUSED);
    }

    qemu_bh_delete(mis->bh);

    trace_loadvm_postcopy_handle_run_bh("return");
}

/* After all discards we can start running and asking for pages */
static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
{
    PostcopyState ps = postcopy_state_get();

    trace_loadvm_postcopy_handle_run();
    if (ps != POSTCOPY_INCOMING_LISTENING) {
        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
    mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
    qemu_bh_schedule(mis->bh);

    /*
     * We need to finish reading the stream from the package
     * and also stop reading anything more from the stream that loaded the
     * package (since it's now being read by the listener thread).
     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
     */
    return LOADVM_QUIT;
}

/* Must be called with page_request_mutex held */
static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
                                       gpointer data)
{
    MigrationIncomingState *mis = data;
    void *host_addr = (void *) key;
    ram_addr_t rb_offset;
    RAMBlock *rb;
    int ret;

    rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
    if (!rb) {
        /*
         * This should _never_ happen.  However be nice for a migrating VM to
         * not crash/assert.  Post an error (note: intended to not use *_once
         * because we do want to see all the illegal addresses; and this can
         * never be triggered by the guest so we're safe) and move on to the
         * next entry.
         */
        error_report("%s: illegal host addr %p", __func__, host_addr);
        /* Try the next entry */
        return FALSE;
    }

    ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
    if (ret) {
        /* See the comment above. */
        error_report("%s: send rp message failed for addr %p",
                     __func__, host_addr);
        return FALSE;
    }

    trace_postcopy_page_req_sync(host_addr);

    return FALSE;
}

static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
{
    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
        g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
    }
}
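
/*
 * Handle MIG_CMD_POSTCOPY_RESUME: the source has re-established the channel
 * after a failure and wants to continue postcopy.  Switch back to
 * POSTCOPY_ACTIVE, re-send any pending page requests and wake up the fault
 * thread(s) that were parked by postcopy_pause_incoming().
 */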
static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
{
    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: illegal resume received", __func__);
        /* Don't fail the load, only for this. */
        return 0;
    }

    /*
     * Reset the last_rb before we resend any page req to source again, since
     * the source should have it reset already.
     */
    mis->last_rb = NULL;

    /*
     * This means the source VM is ready to resume the postcopy migration.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
                      MIGRATION_STATUS_POSTCOPY_ACTIVE);

    trace_loadvm_postcopy_handle_resume();

    /* Tell source that "we are ready" */
    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);

    /*
     * After a postcopy recovery, the source should have lost the postcopy
     * queue, or potentially the requested pages could have been lost during
     * the network down phase.  Let's re-sync with the source VM by re-sending
     * all the pending pages that we eagerly need, so these threads won't get
     * blocked too long due to the recovery.
     *
     * Without this procedure, the faulted destination VM threads (waiting for
     * page requests right before the postcopy is interrupted) can keep hanging
     * until the pages are sent by the source during the background copying of
     * pages, or until another thread accidentally faults on the same address.
     */
    migrate_send_rp_req_pages_pending(mis);

    /*
     * It's time to switch state and release the fault thread to continue
     * servicing page faults.  Note that this should be explicitly after the
     * above call to migrate_send_rp_req_pages_pending().  In short:
     * migrate_send_rp_message_req_pages() is not thread safe, yet.
     */
    qemu_sem_post(&mis->postcopy_pause_sem_fault);

    if (migrate_postcopy_preempt()) {
        /*
         * The preempt channel will be created in an async manner, now let's
         * wait for it and make sure it's created.
         */
        qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
        assert(mis->postcopy_qemufile_dst);
        /* Kick the fast ram load thread too */
        qemu_sem_post(&mis->postcopy_pause_sem_fast_load);
    }

    return 0;
}

/**
 * Immediately following this command is a blob of data containing an embedded
 * chunk of migration stream; read it and load it.  The length of the blob is
 * read off the wire as a 32 bit value first.
 *
 * @mis: Incoming state
 *
 * Returns: Negative values on error
 */
static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
{
    int ret;
    size_t length;
    QIOChannelBuffer *bioc;

    length = qemu_get_be32(mis->from_src_file);
    trace_loadvm_handle_cmd_packaged(length);

    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
        error_report("Unreasonably large packaged state: %zu", length);
        return -1;
    }

    bioc = qio_channel_buffer_new(length);
    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
    ret = qemu_get_buffer(mis->from_src_file,
                          bioc->data,
                          length);
    if (ret != length) {
        object_unref(OBJECT(bioc));
        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
                     ret, length);
        return (ret < 0) ? ret : -EAGAIN;
    }
    bioc->usage += length;
    trace_loadvm_handle_cmd_packaged_received(ret);

    QEMUFile *packf = qemu_file_new_input(QIO_CHANNEL(bioc));

    ret = qemu_loadvm_state_main(packf, mis);
    trace_loadvm_handle_cmd_packaged_main(ret);
    qemu_fclose(packf);
    object_unref(OBJECT(bioc));

    return ret;
}
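
/*
 * For reference, the MIG_CMD_PACKAGED payload parsed above is simply:
 *
 *   be32  length             bounded by MAX_VM_CMD_PACKAGED_SIZE
 *   byte  data[length]       a nested migration stream, replayed through
 *                            qemu_loadvm_state_main() from a buffer channel
 */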

/*
 * Handle a request from the source for the recved_bitmap on the
 * destination.  Payload format:
 *
 * len (1 byte) + ramblock_name (<255 bytes)
 */
static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
                                     uint16_t len)
{
    QEMUFile *file = mis->from_src_file;
    RAMBlock *rb;
    char block_name[256];
    size_t cnt;

    cnt = qemu_get_counted_string(file, block_name);
    if (!cnt) {
        error_report("%s: failed to read block name", __func__);
        return -EINVAL;
    }

    /* Validate before using the data */
    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    if (len != cnt + 1) {
        error_report("%s: invalid payload length (%d)", __func__, len);
        return -EINVAL;
    }

    rb = qemu_ram_block_by_name(block_name);
    if (!rb) {
        error_report("%s: block '%s' not found", __func__, block_name);
        return -EINVAL;
    }

    migrate_send_rp_recv_bitmap(mis, block_name);

    trace_loadvm_handle_recv_bitmap(block_name);

    return 0;
}
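
/*
 * Handle MIG_CMD_ENABLE_COLO: switch the incoming side into COLO mode and
 * set up the RAM cache used to stage incoming pages; the mode is switched
 * off again if the cache cannot be allocated.
 */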
static int loadvm_process_enable_colo(MigrationIncomingState *mis)
{
    int ret = migration_incoming_enable_colo();

    if (!ret) {
        ret = colo_init_ram_cache();
        if (ret) {
            migration_incoming_disable_colo();
        }
    }
    return ret;
}

/*
 * Process an incoming 'QEMU_VM_COMMAND'
 * 0            just a normal return
 * LOADVM_QUIT  All good, but exit the loop
 * <0           Error
 */
static int loadvm_process_command(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    uint16_t cmd;
    uint16_t len;
    uint32_t tmp32;

    cmd = qemu_get_be16(f);
    len = qemu_get_be16(f);

    /* Check validity before continuing to process the commands */
    if (qemu_file_get_error(f)) {
        return qemu_file_get_error(f);
    }

    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
        return -EINVAL;
    }

    trace_loadvm_process_command(mig_cmd_args[cmd].name, len);

    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
        error_report("%s received with bad length - expecting %zu, got %d",
                     mig_cmd_args[cmd].name,
                     (size_t)mig_cmd_args[cmd].len, len);
        return -ERANGE;
    }

    switch (cmd) {
    case MIG_CMD_OPEN_RETURN_PATH:
        if (mis->to_src_file) {
            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
            /* Not really a problem, so don't give up */
            return 0;
        }
        mis->to_src_file = qemu_file_get_return_path(f);
        if (!mis->to_src_file) {
            error_report("CMD_OPEN_RETURN_PATH failed");
            return -1;
        }
        break;

    case MIG_CMD_PING:
        tmp32 = qemu_get_be32(f);
        trace_loadvm_process_command_ping(tmp32);
        if (!mis->to_src_file) {
            error_report("CMD_PING (0x%x) received with no return path",
                         tmp32);
            return -1;
        }
        migrate_send_rp_pong(mis, tmp32);
        break;

    case MIG_CMD_PACKAGED:
        return loadvm_handle_cmd_packaged(mis);

    case MIG_CMD_POSTCOPY_ADVISE:
        return loadvm_postcopy_handle_advise(mis, len);

    case MIG_CMD_POSTCOPY_LISTEN:
        return loadvm_postcopy_handle_listen(mis);

    case MIG_CMD_POSTCOPY_RUN:
        return loadvm_postcopy_handle_run(mis);

    case MIG_CMD_POSTCOPY_RAM_DISCARD:
        return loadvm_postcopy_ram_handle_discard(mis, len);

    case MIG_CMD_POSTCOPY_RESUME:
        return loadvm_postcopy_handle_resume(mis);

    case MIG_CMD_RECV_BITMAP:
        return loadvm_handle_recv_bitmap(mis, len);

    case MIG_CMD_ENABLE_COLO:
        return loadvm_process_enable_colo(mis);
    }

    return 0;
}

/*
 * Read a footer off the wire and check that it matches the expected section
 *
 * Returns: true if the footer was good
 *          false if there is a problem (and calls error_report to say why)
 */
static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
{
    int ret;
    uint8_t read_mark;
    uint32_t read_section_id;

    if (!migrate_get_current()->send_section_footer) {
        /* No footer to check */
        return true;
    }

    read_mark = qemu_get_byte(f);

    ret = qemu_file_get_error(f);
    if (ret) {
        error_report("%s: Read section footer failed: %d",
                     __func__, ret);
        return false;
    }

    if (read_mark != QEMU_VM_SECTION_FOOTER) {
        error_report("Missing section footer for %s", se->idstr);
        return false;
    }

    read_section_id = qemu_get_be32(f);
    if (read_section_id != se->load_section_id) {
        error_report("Mismatched section id in footer for %s -"
                     " read 0x%x expected 0x%x",
                     se->idstr, read_section_id, se->load_section_id);
        return false;
    }

    /* All good */
    return true;
}
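
/*
 * Read and load a single QEMU_VM_SECTION_START or _FULL section.  The
 * header parsed below is:
 *
 *   be32  section id
 *   byte  idstr length, followed by the idstr itself
 *   be32  instance id
 *   be32  version id
 *
 * followed by the device state itself and, optionally, a section footer.
 */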
" 2475 "Make sure that your current VM setup matches your " 2476 "saved VM setup, including any hotplugged devices", 2477 idstr, instance_id); 2478 return -EINVAL; 2479 } 2480 2481 /* Validate version */ 2482 if (version_id > se->version_id) { 2483 error_report("savevm: unsupported version %d for '%s' v%d", 2484 version_id, idstr, se->version_id); 2485 return -EINVAL; 2486 } 2487 se->load_version_id = version_id; 2488 se->load_section_id = section_id; 2489 2490 /* Validate if it is a device's state */ 2491 if (xen_enabled() && se->is_ram) { 2492 error_report("loadvm: %s RAM loading not allowed on Xen", idstr); 2493 return -EINVAL; 2494 } 2495 2496 ret = vmstate_load(f, se); 2497 if (ret < 0) { 2498 error_report("error while loading state for instance 0x%"PRIx32" of" 2499 " device '%s'", instance_id, idstr); 2500 return ret; 2501 } 2502 if (!check_section_footer(f, se)) { 2503 return -EINVAL; 2504 } 2505 2506 return 0; 2507 } 2508 2509 static int 2510 qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis) 2511 { 2512 uint32_t section_id; 2513 SaveStateEntry *se; 2514 int ret; 2515 2516 section_id = qemu_get_be32(f); 2517 2518 ret = qemu_file_get_error(f); 2519 if (ret) { 2520 error_report("%s: Failed to read section ID: %d", 2521 __func__, ret); 2522 return ret; 2523 } 2524 2525 trace_qemu_loadvm_state_section_partend(section_id); 2526 QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { 2527 if (se->load_section_id == section_id) { 2528 break; 2529 } 2530 } 2531 if (se == NULL) { 2532 error_report("Unknown savevm section %d", section_id); 2533 return -EINVAL; 2534 } 2535 2536 ret = vmstate_load(f, se); 2537 if (ret < 0) { 2538 error_report("error while loading state section id %d(%s)", 2539 section_id, se->idstr); 2540 return ret; 2541 } 2542 if (!check_section_footer(f, se)) { 2543 return -EINVAL; 2544 } 2545 2546 return 0; 2547 } 2548 2549 static int qemu_loadvm_state_header(QEMUFile *f) 2550 { 2551 unsigned int v; 2552 int ret; 2553 2554 v = qemu_get_be32(f); 2555 if (v != QEMU_VM_FILE_MAGIC) { 2556 error_report("Not a migration stream"); 2557 return -EINVAL; 2558 } 2559 2560 v = qemu_get_be32(f); 2561 if (v == QEMU_VM_FILE_VERSION_COMPAT) { 2562 error_report("SaveVM v2 format is obsolete and don't work anymore"); 2563 return -ENOTSUP; 2564 } 2565 if (v != QEMU_VM_FILE_VERSION) { 2566 error_report("Unsupported migration stream version"); 2567 return -ENOTSUP; 2568 } 2569 2570 if (migrate_get_current()->send_configuration) { 2571 if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) { 2572 error_report("Configuration section missing"); 2573 qemu_loadvm_state_cleanup(); 2574 return -EINVAL; 2575 } 2576 ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0); 2577 2578 if (ret) { 2579 qemu_loadvm_state_cleanup(); 2580 return ret; 2581 } 2582 } 2583 return 0; 2584 } 2585 2586 static int qemu_loadvm_state_setup(QEMUFile *f) 2587 { 2588 SaveStateEntry *se; 2589 int ret; 2590 2591 trace_loadvm_state_setup(); 2592 QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { 2593 if (!se->ops || !se->ops->load_setup) { 2594 continue; 2595 } 2596 if (se->ops->is_active) { 2597 if (!se->ops->is_active(se->opaque)) { 2598 continue; 2599 } 2600 } 2601 2602 ret = se->ops->load_setup(f, se->opaque); 2603 if (ret < 0) { 2604 qemu_file_set_error(f, ret); 2605 error_report("Load state of device %s failed", se->idstr); 2606 return ret; 2607 } 2608 } 2609 return 0; 2610 } 2611 2612 void qemu_loadvm_state_cleanup(void) 2613 { 2614 SaveStateEntry *se; 2615 2616 trace_loadvm_state_cleanup(); 2617 
void qemu_loadvm_state_cleanup(void)
{
    SaveStateEntry *se;

    trace_loadvm_state_cleanup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->ops && se->ops->load_cleanup) {
            se->ops->load_cleanup(se->opaque);
        }
    }
}
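
/*
 * In a nutshell, the incoming pause/resume dance: on a channel error the
 * load loop below calls postcopy_pause_incoming(), which tears down both
 * channels and blocks on postcopy_pause_sem_dst; once the source
 * reconnects, the recovery path posts that semaphore and the load loop
 * retries on the newly created channel.
 */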
" 2683 "Migration paused."); 2684 2685 while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { 2686 qemu_sem_wait(&mis->postcopy_pause_sem_dst); 2687 } 2688 2689 trace_postcopy_pause_incoming_continued(); 2690 2691 return true; 2692 } 2693 2694 int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) 2695 { 2696 uint8_t section_type; 2697 int ret = 0; 2698 2699 retry: 2700 while (true) { 2701 section_type = qemu_get_byte(f); 2702 2703 ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL); 2704 if (ret) { 2705 break; 2706 } 2707 2708 trace_qemu_loadvm_state_section(section_type); 2709 switch (section_type) { 2710 case QEMU_VM_SECTION_START: 2711 case QEMU_VM_SECTION_FULL: 2712 ret = qemu_loadvm_section_start_full(f, mis); 2713 if (ret < 0) { 2714 goto out; 2715 } 2716 break; 2717 case QEMU_VM_SECTION_PART: 2718 case QEMU_VM_SECTION_END: 2719 ret = qemu_loadvm_section_part_end(f, mis); 2720 if (ret < 0) { 2721 goto out; 2722 } 2723 break; 2724 case QEMU_VM_COMMAND: 2725 ret = loadvm_process_command(f); 2726 trace_qemu_loadvm_state_section_command(ret); 2727 if ((ret < 0) || (ret == LOADVM_QUIT)) { 2728 goto out; 2729 } 2730 break; 2731 case QEMU_VM_EOF: 2732 /* This is the end of migration */ 2733 goto out; 2734 default: 2735 error_report("Unknown savevm section type %d", section_type); 2736 ret = -EINVAL; 2737 goto out; 2738 } 2739 } 2740 2741 out: 2742 if (ret < 0) { 2743 qemu_file_set_error(f, ret); 2744 2745 /* Cancel bitmaps incoming regardless of recovery */ 2746 dirty_bitmap_mig_cancel_incoming(); 2747 2748 /* 2749 * If we are during an active postcopy, then we pause instead 2750 * of bail out to at least keep the VM's dirty data. Note 2751 * that POSTCOPY_INCOMING_LISTENING stage is still not enough, 2752 * during which we're still receiving device states and we 2753 * still haven't yet started the VM on destination. 2754 * 2755 * Only RAM postcopy supports recovery. Still, if RAM postcopy is 2756 * enabled, canceled bitmaps postcopy will not affect RAM postcopy 2757 * recovering. 2758 */ 2759 if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING && 2760 migrate_postcopy_ram() && postcopy_pause_incoming(mis)) { 2761 /* Reset f to point to the newly created channel */ 2762 f = mis->from_src_file; 2763 goto retry; 2764 } 2765 } 2766 return ret; 2767 } 2768 2769 int qemu_loadvm_state(QEMUFile *f) 2770 { 2771 MigrationIncomingState *mis = migration_incoming_get_current(); 2772 Error *local_err = NULL; 2773 int ret; 2774 2775 if (qemu_savevm_state_blocked(&local_err)) { 2776 error_report_err(local_err); 2777 return -EINVAL; 2778 } 2779 2780 ret = qemu_loadvm_state_header(f); 2781 if (ret) { 2782 return ret; 2783 } 2784 2785 if (qemu_loadvm_state_setup(f) != 0) { 2786 return -EINVAL; 2787 } 2788 2789 cpu_synchronize_all_pre_loadvm(); 2790 2791 ret = qemu_loadvm_state_main(f, mis); 2792 qemu_event_set(&mis->main_thread_load_event); 2793 2794 trace_qemu_loadvm_state_post_main(ret); 2795 2796 if (mis->have_listen_thread) { 2797 /* Listen thread still going, can't clean up yet */ 2798 return ret; 2799 } 2800 2801 if (ret == 0) { 2802 ret = qemu_file_get_error(f); 2803 } 2804 2805 /* 2806 * Try to read in the VMDESC section as well, so that dumping tools that 2807 * intercept our migration stream have the chance to see it. 2808 */ 2809 2810 /* We've got to be careful; if we don't read the data and just shut the fd 2811 * then the sender can error if we close while it's still sending. 
int qemu_loadvm_state(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;
    int ret;

    if (qemu_savevm_state_blocked(&local_err)) {
        error_report_err(local_err);
        return -EINVAL;
    }

    ret = qemu_loadvm_state_header(f);
    if (ret) {
        return ret;
    }

    if (qemu_loadvm_state_setup(f) != 0) {
        return -EINVAL;
    }

    cpu_synchronize_all_pre_loadvm();

    ret = qemu_loadvm_state_main(f, mis);
    qemu_event_set(&mis->main_thread_load_event);

    trace_qemu_loadvm_state_post_main(ret);

    if (mis->have_listen_thread) {
        /* Listen thread still going, can't clean up yet */
        return ret;
    }

    if (ret == 0) {
        ret = qemu_file_get_error(f);
    }

    /*
     * Try to read in the VMDESC section as well, so that dumping tools that
     * intercept our migration stream have the chance to see it.
     */

    /*
     * We've got to be careful; if we don't read the data and just shut the fd
     * then the sender can error if we close while it's still sending.
     * We also mustn't read data that isn't there; some transports (RDMA)
     * will stall waiting for that data when the source has already closed.
     */
    if (ret == 0 && should_send_vmdesc()) {
        uint8_t *buf;
        uint32_t size;
        uint8_t section_type = qemu_get_byte(f);

        if (section_type != QEMU_VM_VMDESCRIPTION) {
            error_report("Expected vmdescription section, but got %d",
                         section_type);
            /*
             * It doesn't seem worth failing at this point since
             * we apparently have an otherwise valid VM state
             */
        } else {
            buf = g_malloc(0x1000);
            size = qemu_get_be32(f);

            while (size > 0) {
                uint32_t read_chunk = MIN(size, 0x1000);
                qemu_get_buffer(f, buf, read_chunk);
                size -= read_chunk;
            }
            g_free(buf);
        }
    }

    qemu_loadvm_state_cleanup();
    cpu_synchronize_all_post_init();

    return ret;
}

int qemu_load_device_state(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int ret;

    /* Load QEMU_VM_SECTION_FULL section */
    ret = qemu_loadvm_state_main(f, mis);
    if (ret < 0) {
        error_report("Failed to load device state: %d", ret);
        return ret;
    }

    cpu_synchronize_all_post_init();
    return 0;
}
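
/*
 * Save an internal VM snapshot: stop the VM, write the device state via
 * qemu_savevm_state() into the vmstate block device, then create a
 * snapshot of the same name on every block device.  Returns true on
 * success; in all cases the VM is restarted if it was running on entry.
 */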
bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
                   bool has_devices, strList *devices, Error **errp)
{
    BlockDriverState *bs;
    QEMUSnapshotInfo sn1, *sn = &sn1;
    int ret = -1, ret2;
    QEMUFile *f;
    int saved_vm_running;
    uint64_t vm_state_size;
    g_autoptr(GDateTime) now = g_date_time_new_now_local();
    AioContext *aio_context;

    GLOBAL_STATE_CODE();

    if (migration_is_blocked(errp)) {
        return false;
    }

    if (!replay_can_snapshot()) {
        error_setg(errp, "Record/replay does not allow making a snapshot "
                   "right now. Try again later.");
        return false;
    }

    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
        return false;
    }

    /* Delete old snapshots of the same name */
    if (name) {
        if (overwrite) {
            if (bdrv_all_delete_snapshot(name, has_devices,
                                         devices, errp) < 0) {
                return false;
            }
        } else {
            ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
            if (ret2 < 0) {
                return false;
            }
            if (ret2 == 1) {
                error_setg(errp,
                           "Snapshot '%s' already exists in one or more devices",
                           name);
                return false;
            }
        }
    }

    bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
    if (bs == NULL) {
        return false;
    }
    aio_context = bdrv_get_aio_context(bs);

    saved_vm_running = runstate_is_running();

    ret = global_state_store();
    if (ret) {
        error_setg(errp, "Error saving global state");
        return false;
    }
    vm_stop(RUN_STATE_SAVE_VM);

    bdrv_drain_all_begin();

    aio_context_acquire(aio_context);

    memset(sn, 0, sizeof(*sn));

    /* Fill auxiliary fields */
    sn->date_sec = g_date_time_to_unix(now);
    sn->date_nsec = g_date_time_get_microsecond(now) * 1000;
    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    if (replay_mode != REPLAY_MODE_NONE) {
        sn->icount = replay_get_current_icount();
    } else {
        sn->icount = -1ULL;
    }

    if (name) {
        pstrcpy(sn->name, sizeof(sn->name), name);
    } else {
        g_autofree char *autoname = g_date_time_format(now, "vm-%Y%m%d%H%M%S");
        pstrcpy(sn->name, sizeof(sn->name), autoname);
    }

    /* Save the VM state */
    f = qemu_fopen_bdrv(bs, 1);
    if (!f) {
        error_setg(errp, "Could not open VM state file");
        goto the_end;
    }
    ret = qemu_savevm_state(f, errp);
    vm_state_size = qemu_file_total_transferred(f);
    ret2 = qemu_fclose(f);
    if (ret < 0) {
        goto the_end;
    }
    if (ret2 < 0) {
        ret = ret2;
        goto the_end;
    }

    /*
     * The bdrv_all_create_snapshot() call that follows acquires the AioContext
     * for itself.  BDRV_POLL_WHILE() does not support nested locking because
     * it only releases the lock once.  Therefore synchronous I/O will deadlock
     * unless we release the AioContext before bdrv_all_create_snapshot().
     */
    aio_context_release(aio_context);
    aio_context = NULL;

    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
                                   has_devices, devices, errp);
    if (ret < 0) {
        bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
        goto the_end;
    }

    ret = 0;

 the_end:
    if (aio_context) {
        aio_context_release(aio_context);
    }

    bdrv_drain_all_end();

    if (saved_vm_running) {
        vm_start();
    }
    return ret == 0;
}

void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
                                Error **errp)
{
    QEMUFile *f;
    QIOChannelFile *ioc;
    int saved_vm_running;
    int ret;

    if (!has_live) {
        /*
         * live defaults to true so old versions of the Xen tool stack can
         * have a successful live migration.
         */
        live = true;
    }

    saved_vm_running = runstate_is_running();
    vm_stop(RUN_STATE_SAVE_VM);
    global_state_store_running();

    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
                                    0660, errp);
    if (!ioc) {
        goto the_end;
    }
    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
    f = qemu_file_new_output(QIO_CHANNEL(ioc));
    object_unref(OBJECT(ioc));
    ret = qemu_save_device_state(f);
    if (ret < 0 || qemu_fclose(f) < 0) {
        error_setg(errp, QERR_IO_ERROR);
    } else {
        /*
         * libxl calls the QMP command "stop" before calling
         * "xen-save-devices-state" and in case of migration failure, libxl
         * would call "cont".
         * So call bdrv_inactivate_all (release locks) here to let the other
         * side of the migration take control of the images.
         */
        if (live && !saved_vm_running) {
            ret = bdrv_inactivate_all();
            if (ret) {
                error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
                           __func__, ret);
            }
        }
    }

 the_end:
    if (saved_vm_running) {
        vm_start();
    }
}

void qmp_xen_load_devices_state(const char *filename, Error **errp)
{
    QEMUFile *f;
    QIOChannelFile *ioc;
    int ret;

    /*
     * Guest must be paused before loading the device state; the RAM state
     * will already have been loaded by xc.
     */
    if (runstate_is_running()) {
        error_setg(errp, "Cannot update device state while vm is running");
        return;
    }
    vm_stop(RUN_STATE_RESTORE_VM);

    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
    if (!ioc) {
        return;
    }
    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
    f = qemu_file_new_input(QIO_CHANNEL(ioc));
    object_unref(OBJECT(ioc));

    ret = qemu_loadvm_state(f);
    qemu_fclose(f);
    if (ret < 0) {
        error_setg(errp, QERR_IO_ERROR);
    }
    migration_incoming_state_destroy();
}
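
/*
 * Load an internal VM snapshot previously created by save_snapshot():
 * revert every block device to the snapshot, reset the machine and read
 * the vmstate back through qemu_loadvm_state().  Returns true on success
 * and false (with *errp set) on failure.
 */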
bool load_snapshot(const char *name, const char *vmstate,
                   bool has_devices, strList *devices, Error **errp)
{
    BlockDriverState *bs_vm_state;
    QEMUSnapshotInfo sn;
    QEMUFile *f;
    int ret;
    AioContext *aio_context;
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
        return false;
    }
    ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
    if (ret < 0) {
        return false;
    }
    if (ret == 0) {
        error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
                   name);
        return false;
    }

    bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
    if (!bs_vm_state) {
        return false;
    }
    aio_context = bdrv_get_aio_context(bs_vm_state);

    /* Don't even try to load empty VM states */
    aio_context_acquire(aio_context);
    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
    aio_context_release(aio_context);
    if (ret < 0) {
        return false;
    } else if (sn.vm_state_size == 0) {
        error_setg(errp, "This is a disk-only snapshot. Revert to it "
                   "offline using qemu-img");
        return false;
    }

    /*
     * Flush the record/replay queue.  The VM state is about to change, so
     * we don't need to preserve its consistency.
     */
    replay_flush_events();

    /* Flush all IO requests so they don't interfere with the new state. */
    bdrv_drain_all_begin();

    ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
    if (ret < 0) {
        goto err_drain;
    }

    /* Restore the VM state */
    f = qemu_fopen_bdrv(bs_vm_state, 0);
    if (!f) {
        error_setg(errp, "Could not open VM state file");
        goto err_drain;
    }

    qemu_system_reset(SHUTDOWN_CAUSE_SNAPSHOT_LOAD);
    mis->from_src_file = f;

    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
        ret = -EINVAL;
        goto err_drain;
    }
    aio_context_acquire(aio_context);
    ret = qemu_loadvm_state(f);
    migration_incoming_state_destroy();
    aio_context_release(aio_context);

    bdrv_drain_all_end();

    if (ret < 0) {
        error_setg(errp, "Error %d while loading VM state", ret);
        return false;
    }

    return true;

err_drain:
    bdrv_drain_all_end();
    return false;
}

bool delete_snapshot(const char *name, bool has_devices,
                     strList *devices, Error **errp)
{
    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
        return false;
    }

    if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
        return false;
    }

    return true;
}

void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
{
    qemu_ram_set_idstr(mr->ram_block,
                       memory_region_name(mr), dev);
    qemu_ram_set_migratable(mr->ram_block);
}

void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
{
    qemu_ram_unset_idstr(mr->ram_block);
    qemu_ram_unset_migratable(mr->ram_block);
}

void vmstate_register_ram_global(MemoryRegion *mr)
{
    vmstate_register_ram(mr, NULL);
}

bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
{
    /* Check needed if --only-migratable is specified */
    if (!only_migratable) {
        return true;
    }

    return !(vmsd && vmsd->unmigratable);
}

typedef struct SnapshotJob {
    Job common;
    char *tag;
    char *vmstate;
    strList *devices;
    Coroutine *co;
    Error **errp;
    bool ret;
} SnapshotJob;

static void qmp_snapshot_job_free(SnapshotJob *s)
{
    g_free(s->tag);
    g_free(s->vmstate);
    qapi_free_strList(s->devices);
}
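
/*
 * The snapshot QMP commands below run as background jobs: each job's
 * coroutine schedules a bottom half in the main loop to do the actual
 * (blocking) snapshot work, yields, and is woken up again with the result
 * once the bottom half has finished.
 */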

static void snapshot_load_job_bh(void *opaque)
{
    Job *job = opaque;
    SnapshotJob *s = container_of(job, SnapshotJob, common);
    int orig_vm_running;

    job_progress_set_remaining(&s->common, 1);

    orig_vm_running = runstate_is_running();
    vm_stop(RUN_STATE_RESTORE_VM);

    s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
    if (s->ret && orig_vm_running) {
        vm_start();
    }

    job_progress_update(&s->common, 1);

    qmp_snapshot_job_free(s);
    aio_co_wake(s->co);
}

static void snapshot_save_job_bh(void *opaque)
{
    Job *job = opaque;
    SnapshotJob *s = container_of(job, SnapshotJob, common);

    job_progress_set_remaining(&s->common, 1);
    s->ret = save_snapshot(s->tag, false, s->vmstate,
                           true, s->devices, s->errp);
    job_progress_update(&s->common, 1);

    qmp_snapshot_job_free(s);
    aio_co_wake(s->co);
}

static void snapshot_delete_job_bh(void *opaque)
{
    Job *job = opaque;
    SnapshotJob *s = container_of(job, SnapshotJob, common);

    job_progress_set_remaining(&s->common, 1);
    s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
    job_progress_update(&s->common, 1);

    qmp_snapshot_job_free(s);
    aio_co_wake(s->co);
}

static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
{
    SnapshotJob *s = container_of(job, SnapshotJob, common);
    s->errp = errp;
    s->co = qemu_coroutine_self();
    aio_bh_schedule_oneshot(qemu_get_aio_context(),
                            snapshot_save_job_bh, job);
    qemu_coroutine_yield();
    return s->ret ? 0 : -1;
}

static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
{
    SnapshotJob *s = container_of(job, SnapshotJob, common);
    s->errp = errp;
    s->co = qemu_coroutine_self();
    aio_bh_schedule_oneshot(qemu_get_aio_context(),
                            snapshot_load_job_bh, job);
    qemu_coroutine_yield();
    return s->ret ? 0 : -1;
}

static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
{
    SnapshotJob *s = container_of(job, SnapshotJob, common);
    s->errp = errp;
    s->co = qemu_coroutine_self();
    aio_bh_schedule_oneshot(qemu_get_aio_context(),
                            snapshot_delete_job_bh, job);
    qemu_coroutine_yield();
    return s->ret ? 0 : -1;
}

static const JobDriver snapshot_load_job_driver = {
    .instance_size = sizeof(SnapshotJob),
    .job_type = JOB_TYPE_SNAPSHOT_LOAD,
    .run = snapshot_load_job_run,
};

static const JobDriver snapshot_save_job_driver = {
    .instance_size = sizeof(SnapshotJob),
    .job_type = JOB_TYPE_SNAPSHOT_SAVE,
    .run = snapshot_save_job_run,
};

static const JobDriver snapshot_delete_job_driver = {
    .instance_size = sizeof(SnapshotJob),
    .job_type = JOB_TYPE_SNAPSHOT_DELETE,
    .run = snapshot_delete_job_run,
};

void qmp_snapshot_save(const char *job_id,
                       const char *tag,
                       const char *vmstate,
                       strList *devices,
                       Error **errp)
{
    SnapshotJob *s;

    s = job_create(job_id, &snapshot_save_job_driver, NULL,
                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
                   NULL, NULL, errp);
    if (!s) {
        return;
    }

    s->tag = g_strdup(tag);
    s->vmstate = g_strdup(vmstate);
    s->devices = QAPI_CLONE(strList, devices);

    job_start(&s->common);
}

void qmp_snapshot_load(const char *job_id,
                       const char *tag,
                       const char *vmstate,
                       strList *devices,
                       Error **errp)
{
    SnapshotJob *s;

    s = job_create(job_id, &snapshot_load_job_driver, NULL,
                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
                   NULL, NULL, errp);
    if (!s) {
        return;
    }

    s->tag = g_strdup(tag);
    s->vmstate = g_strdup(vmstate);
    s->devices = QAPI_CLONE(strList, devices);

    job_start(&s->common);
}

void qmp_snapshot_delete(const char *job_id,
                         const char *tag,
                         strList *devices,
                         Error **errp)
{
    SnapshotJob *s;

    s = job_create(job_id, &snapshot_delete_job_driver, NULL,
                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
                   NULL, NULL, errp);
    if (!s) {
        return;
    }

    s->tag = g_strdup(tag);
    s->devices = QAPI_CLONE(strList, devices);

    job_start(&s->common);
}