1 /* 2 * Postcopy migration for RAM 3 * 4 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates 5 * 6 * Authors: 7 * Dave Gilbert <dgilbert@redhat.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or later. 10 * See the COPYING file in the top-level directory. 11 * 12 */ 13 14 /* 15 * Postcopy is a migration technique where the execution flips from the 16 * source to the destination before all the data has been copied. 17 */ 18 19 #include "qemu/osdep.h" 20 #include "qemu/madvise.h" 21 #include "exec/target_page.h" 22 #include "migration.h" 23 #include "qemu-file.h" 24 #include "savevm.h" 25 #include "postcopy-ram.h" 26 #include "ram.h" 27 #include "qapi/error.h" 28 #include "qemu/notify.h" 29 #include "qemu/rcu.h" 30 #include "sysemu/sysemu.h" 31 #include "qemu/error-report.h" 32 #include "trace.h" 33 #include "hw/boards.h" 34 #include "exec/ramblock.h" 35 #include "socket.h" 36 #include "yank_functions.h" 37 #include "tls.h" 38 #include "qemu/userfaultfd.h" 39 #include "qemu/mmap-alloc.h" 40 #include "options.h" 41 42 /* Arbitrary limit on size of each discard command, 43 * keeps them around ~200 bytes 44 */ 45 #define MAX_DISCARDS_PER_COMMAND 12 46 47 typedef struct PostcopyDiscardState { 48 const char *ramblock_name; 49 uint16_t cur_entry; 50 /* 51 * Start and length of a discard range (bytes) 52 */ 53 uint64_t start_list[MAX_DISCARDS_PER_COMMAND]; 54 uint64_t length_list[MAX_DISCARDS_PER_COMMAND]; 55 unsigned int nsentwords; 56 unsigned int nsentcmds; 57 } PostcopyDiscardState; 58 59 static NotifierWithReturnList postcopy_notifier_list; 60 61 void postcopy_infrastructure_init(void) 62 { 63 notifier_with_return_list_init(&postcopy_notifier_list); 64 } 65 66 void postcopy_add_notifier(NotifierWithReturn *nn) 67 { 68 notifier_with_return_list_add(&postcopy_notifier_list, nn); 69 } 70 71 void postcopy_remove_notifier(NotifierWithReturn *n) 72 { 73 notifier_with_return_remove(n); 74 } 75 76 int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp) 77 { 78 struct PostcopyNotifyData pnd; 79 pnd.reason = reason; 80 81 return notifier_with_return_list_notify(&postcopy_notifier_list, 82 &pnd, errp); 83 } 84 85 /* 86 * NOTE: this routine is not thread safe, we can't call it concurrently. But it 87 * should be good enough for migration's purposes. 88 */ 89 void postcopy_thread_create(MigrationIncomingState *mis, 90 QemuThread *thread, const char *name, 91 void *(*fn)(void *), int joinable) 92 { 93 qemu_sem_init(&mis->thread_sync_sem, 0); 94 qemu_thread_create(thread, name, fn, mis, joinable); 95 qemu_sem_wait(&mis->thread_sync_sem); 96 qemu_sem_destroy(&mis->thread_sync_sem); 97 } 98 99 /* Postcopy needs to detect accesses to pages that haven't yet been copied 100 * across, and efficiently map new pages in, the techniques for doing this 101 * are target OS specific. 102 */ 103 #if defined(__linux__) 104 #include <poll.h> 105 #include <sys/ioctl.h> 106 #include <sys/syscall.h> 107 #endif 108 109 #if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD) 110 #include <sys/eventfd.h> 111 #include <linux/userfaultfd.h> 112 113 typedef struct PostcopyBlocktimeContext { 114 /* time when page fault initiated per vCPU */ 115 uint32_t *page_fault_vcpu_time; 116 /* page address per vCPU */ 117 uintptr_t *vcpu_addr; 118 uint32_t total_blocktime; 119 /* blocktime per vCPU */ 120 uint32_t *vcpu_blocktime; 121 /* point in time when last page fault was initiated */ 122 uint32_t last_begin; 123 /* number of vCPU are suspended */ 124 int smp_cpus_down; 125 uint64_t start_time; 126 127 /* 128 * Handler for exit event, necessary for 129 * releasing whole blocktime_ctx 130 */ 131 Notifier exit_notifier; 132 } PostcopyBlocktimeContext; 133 134 static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx) 135 { 136 g_free(ctx->page_fault_vcpu_time); 137 g_free(ctx->vcpu_addr); 138 g_free(ctx->vcpu_blocktime); 139 g_free(ctx); 140 } 141 142 static void migration_exit_cb(Notifier *n, void *data) 143 { 144 PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext, 145 exit_notifier); 146 destroy_blocktime_context(ctx); 147 } 148 149 static struct PostcopyBlocktimeContext *blocktime_context_new(void) 150 { 151 MachineState *ms = MACHINE(qdev_get_machine()); 152 unsigned int smp_cpus = ms->smp.cpus; 153 PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1); 154 ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus); 155 ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus); 156 ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus); 157 158 ctx->exit_notifier.notify = migration_exit_cb; 159 ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 160 qemu_add_exit_notifier(&ctx->exit_notifier); 161 return ctx; 162 } 163 164 static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx) 165 { 166 MachineState *ms = MACHINE(qdev_get_machine()); 167 uint32List *list = NULL; 168 int i; 169 170 for (i = ms->smp.cpus - 1; i >= 0; i--) { 171 QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]); 172 } 173 174 return list; 175 } 176 177 /* 178 * This function just populates MigrationInfo from postcopy's 179 * blocktime context. It will not populate MigrationInfo, 180 * unless postcopy-blocktime capability was set. 181 * 182 * @info: pointer to MigrationInfo to populate 183 */ 184 void fill_destination_postcopy_migration_info(MigrationInfo *info) 185 { 186 MigrationIncomingState *mis = migration_incoming_get_current(); 187 PostcopyBlocktimeContext *bc = mis->blocktime_ctx; 188 189 if (!bc) { 190 return; 191 } 192 193 info->has_postcopy_blocktime = true; 194 info->postcopy_blocktime = bc->total_blocktime; 195 info->has_postcopy_vcpu_blocktime = true; 196 info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc); 197 } 198 199 static uint32_t get_postcopy_total_blocktime(void) 200 { 201 MigrationIncomingState *mis = migration_incoming_get_current(); 202 PostcopyBlocktimeContext *bc = mis->blocktime_ctx; 203 204 if (!bc) { 205 return 0; 206 } 207 208 return bc->total_blocktime; 209 } 210 211 /** 212 * receive_ufd_features: check userfault fd features, to request only supported 213 * features in the future. 214 * 215 * Returns: true on success 216 * 217 * __NR_userfaultfd - should be checked before 218 * @features: out parameter will contain uffdio_api.features provided by kernel 219 * in case of success 220 */ 221 static bool receive_ufd_features(uint64_t *features) 222 { 223 struct uffdio_api api_struct = {0}; 224 int ufd; 225 bool ret = true; 226 227 ufd = uffd_open(O_CLOEXEC); 228 if (ufd == -1) { 229 error_report("%s: uffd_open() failed: %s", __func__, strerror(errno)); 230 return false; 231 } 232 233 /* ask features */ 234 api_struct.api = UFFD_API; 235 api_struct.features = 0; 236 if (ioctl(ufd, UFFDIO_API, &api_struct)) { 237 error_report("%s: UFFDIO_API failed: %s", __func__, 238 strerror(errno)); 239 ret = false; 240 goto release_ufd; 241 } 242 243 *features = api_struct.features; 244 245 release_ufd: 246 close(ufd); 247 return ret; 248 } 249 250 /** 251 * request_ufd_features: this function should be called only once on a newly 252 * opened ufd, subsequent calls will lead to error. 253 * 254 * Returns: true on success 255 * 256 * @ufd: fd obtained from userfaultfd syscall 257 * @features: bit mask see UFFD_API_FEATURES 258 */ 259 static bool request_ufd_features(int ufd, uint64_t features) 260 { 261 struct uffdio_api api_struct = {0}; 262 uint64_t ioctl_mask; 263 264 api_struct.api = UFFD_API; 265 api_struct.features = features; 266 if (ioctl(ufd, UFFDIO_API, &api_struct)) { 267 error_report("%s failed: UFFDIO_API failed: %s", __func__, 268 strerror(errno)); 269 return false; 270 } 271 272 ioctl_mask = 1ULL << _UFFDIO_REGISTER | 273 1ULL << _UFFDIO_UNREGISTER; 274 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { 275 error_report("Missing userfault features: %" PRIx64, 276 (uint64_t)(~api_struct.ioctls & ioctl_mask)); 277 return false; 278 } 279 280 return true; 281 } 282 283 static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis, 284 Error **errp) 285 { 286 ERRP_GUARD(); 287 uint64_t asked_features = 0; 288 static uint64_t supported_features; 289 290 /* 291 * it's not possible to 292 * request UFFD_API twice per one fd 293 * userfault fd features is persistent 294 */ 295 if (!supported_features) { 296 if (!receive_ufd_features(&supported_features)) { 297 error_setg(errp, "Userfault feature detection failed"); 298 return false; 299 } 300 } 301 302 #ifdef UFFD_FEATURE_THREAD_ID 303 if (UFFD_FEATURE_THREAD_ID & supported_features) { 304 asked_features |= UFFD_FEATURE_THREAD_ID; 305 if (migrate_postcopy_blocktime()) { 306 if (!mis->blocktime_ctx) { 307 mis->blocktime_ctx = blocktime_context_new(); 308 } 309 } 310 } 311 #endif 312 313 /* 314 * request features, even if asked_features is 0, due to 315 * kernel expects UFFD_API before UFFDIO_REGISTER, per 316 * userfault file descriptor 317 */ 318 if (!request_ufd_features(ufd, asked_features)) { 319 error_setg(errp, "Failed features %" PRIu64, asked_features); 320 return false; 321 } 322 323 if (qemu_real_host_page_size() != ram_pagesize_summary()) { 324 bool have_hp = false; 325 /* We've got a huge page */ 326 #ifdef UFFD_FEATURE_MISSING_HUGETLBFS 327 have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS; 328 #endif 329 if (!have_hp) { 330 error_setg(errp, 331 "Userfault on this host does not support huge pages"); 332 return false; 333 } 334 } 335 return true; 336 } 337 338 /* Callback from postcopy_ram_supported_by_host block iterator. 339 */ 340 static int test_ramblock_postcopiable(RAMBlock *rb, Error **errp) 341 { 342 const char *block_name = qemu_ram_get_idstr(rb); 343 ram_addr_t length = qemu_ram_get_used_length(rb); 344 size_t pagesize = qemu_ram_pagesize(rb); 345 QemuFsType fs; 346 347 if (length % pagesize) { 348 error_setg(errp, 349 "Postcopy requires RAM blocks to be a page size multiple," 350 " block %s is 0x" RAM_ADDR_FMT " bytes with a " 351 "page size of 0x%zx", block_name, length, pagesize); 352 return 1; 353 } 354 355 if (rb->fd >= 0) { 356 fs = qemu_fd_getfs(rb->fd); 357 if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) { 358 error_setg(errp, 359 "Host backend files need to be TMPFS or HUGETLBFS only"); 360 return 1; 361 } 362 } 363 364 return 0; 365 } 366 367 /* 368 * Note: This has the side effect of munlock'ing all of RAM, that's 369 * normally fine since if the postcopy succeeds it gets turned back on at the 370 * end. 371 */ 372 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp) 373 { 374 ERRP_GUARD(); 375 long pagesize = qemu_real_host_page_size(); 376 int ufd = -1; 377 bool ret = false; /* Error unless we change it */ 378 void *testarea = NULL; 379 struct uffdio_register reg_struct; 380 struct uffdio_range range_struct; 381 uint64_t feature_mask; 382 RAMBlock *block; 383 384 if (qemu_target_page_size() > pagesize) { 385 error_setg(errp, "Target page size bigger than host page size"); 386 goto out; 387 } 388 389 ufd = uffd_open(O_CLOEXEC); 390 if (ufd == -1) { 391 error_setg(errp, "Userfaultfd not available: %s", strerror(errno)); 392 goto out; 393 } 394 395 /* Give devices a chance to object */ 396 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, errp)) { 397 goto out; 398 } 399 400 /* Version and features check */ 401 if (!ufd_check_and_apply(ufd, mis, errp)) { 402 goto out; 403 } 404 405 /* 406 * We don't support postcopy with some type of ramblocks. 407 * 408 * NOTE: we explicitly ignored migrate_ram_is_ignored() instead we checked 409 * all possible ramblocks. This is because this function can be called 410 * when creating the migration object, during the phase RAM_MIGRATABLE 411 * is not even properly set for all the ramblocks. 412 * 413 * A side effect of this is we'll also check against RAM_SHARED 414 * ramblocks even if migrate_ignore_shared() is set (in which case 415 * we'll never migrate RAM_SHARED at all), but normally this shouldn't 416 * affect in reality, or we can revisit. 417 */ 418 RAMBLOCK_FOREACH(block) { 419 if (test_ramblock_postcopiable(block, errp)) { 420 goto out; 421 } 422 } 423 424 /* 425 * userfault and mlock don't go together; we'll put it back later if 426 * it was enabled. 427 */ 428 if (munlockall()) { 429 error_setg(errp, "munlockall() failed: %s", strerror(errno)); 430 goto out; 431 } 432 433 /* 434 * We need to check that the ops we need are supported on anon memory 435 * To do that we need to register a chunk and see the flags that 436 * are returned. 437 */ 438 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | 439 MAP_ANONYMOUS, -1, 0); 440 if (testarea == MAP_FAILED) { 441 error_setg(errp, "Failed to map test area: %s", strerror(errno)); 442 goto out; 443 } 444 g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize)); 445 446 reg_struct.range.start = (uintptr_t)testarea; 447 reg_struct.range.len = pagesize; 448 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; 449 450 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) { 451 error_setg(errp, "UFFDIO_REGISTER failed: %s", strerror(errno)); 452 goto out; 453 } 454 455 range_struct.start = (uintptr_t)testarea; 456 range_struct.len = pagesize; 457 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) { 458 error_setg(errp, "UFFDIO_UNREGISTER failed: %s", strerror(errno)); 459 goto out; 460 } 461 462 feature_mask = 1ULL << _UFFDIO_WAKE | 463 1ULL << _UFFDIO_COPY | 464 1ULL << _UFFDIO_ZEROPAGE; 465 if ((reg_struct.ioctls & feature_mask) != feature_mask) { 466 error_setg(errp, "Missing userfault map features: %" PRIx64, 467 (uint64_t)(~reg_struct.ioctls & feature_mask)); 468 goto out; 469 } 470 471 /* Success! */ 472 ret = true; 473 out: 474 if (testarea) { 475 munmap(testarea, pagesize); 476 } 477 if (ufd != -1) { 478 close(ufd); 479 } 480 return ret; 481 } 482 483 /* 484 * Setup an area of RAM so that it *can* be used for postcopy later; this 485 * must be done right at the start prior to pre-copy. 486 * opaque should be the MIS. 487 */ 488 static int init_range(RAMBlock *rb, void *opaque) 489 { 490 const char *block_name = qemu_ram_get_idstr(rb); 491 void *host_addr = qemu_ram_get_host_addr(rb); 492 ram_addr_t offset = qemu_ram_get_offset(rb); 493 ram_addr_t length = qemu_ram_get_used_length(rb); 494 trace_postcopy_init_range(block_name, host_addr, offset, length); 495 496 /* 497 * Save the used_length before running the guest. In case we have to 498 * resize RAM blocks when syncing RAM block sizes from the source during 499 * precopy, we'll update it manually via the ram block notifier. 500 */ 501 rb->postcopy_length = length; 502 503 /* 504 * We need the whole of RAM to be truly empty for postcopy, so things 505 * like ROMs and any data tables built during init must be zero'd 506 * - we're going to get the copy from the source anyway. 507 * (Precopy will just overwrite this data, so doesn't need the discard) 508 */ 509 if (ram_discard_range(block_name, 0, length)) { 510 return -1; 511 } 512 513 return 0; 514 } 515 516 /* 517 * At the end of migration, undo the effects of init_range 518 * opaque should be the MIS. 519 */ 520 static int cleanup_range(RAMBlock *rb, void *opaque) 521 { 522 const char *block_name = qemu_ram_get_idstr(rb); 523 void *host_addr = qemu_ram_get_host_addr(rb); 524 ram_addr_t offset = qemu_ram_get_offset(rb); 525 ram_addr_t length = rb->postcopy_length; 526 MigrationIncomingState *mis = opaque; 527 struct uffdio_range range_struct; 528 trace_postcopy_cleanup_range(block_name, host_addr, offset, length); 529 530 /* 531 * We turned off hugepage for the precopy stage with postcopy enabled 532 * we can turn it back on now. 533 */ 534 qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE); 535 536 /* 537 * We can also turn off userfault now since we should have all the 538 * pages. It can be useful to leave it on to debug postcopy 539 * if you're not sure it's always getting every page. 540 */ 541 range_struct.start = (uintptr_t)host_addr; 542 range_struct.len = length; 543 544 if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) { 545 error_report("%s: userfault unregister %s", __func__, strerror(errno)); 546 547 return -1; 548 } 549 550 return 0; 551 } 552 553 /* 554 * Initialise postcopy-ram, setting the RAM to a state where we can go into 555 * postcopy later; must be called prior to any precopy. 556 * called from arch_init's similarly named ram_postcopy_incoming_init 557 */ 558 int postcopy_ram_incoming_init(MigrationIncomingState *mis) 559 { 560 if (foreach_not_ignored_block(init_range, NULL)) { 561 return -1; 562 } 563 564 return 0; 565 } 566 567 static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis) 568 { 569 int i; 570 571 if (mis->postcopy_tmp_pages) { 572 for (i = 0; i < mis->postcopy_channels; i++) { 573 if (mis->postcopy_tmp_pages[i].tmp_huge_page) { 574 munmap(mis->postcopy_tmp_pages[i].tmp_huge_page, 575 mis->largest_page_size); 576 mis->postcopy_tmp_pages[i].tmp_huge_page = NULL; 577 } 578 } 579 g_free(mis->postcopy_tmp_pages); 580 mis->postcopy_tmp_pages = NULL; 581 } 582 583 if (mis->postcopy_tmp_zero_page) { 584 munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size); 585 mis->postcopy_tmp_zero_page = NULL; 586 } 587 } 588 589 /* 590 * At the end of a migration where postcopy_ram_incoming_init was called. 591 */ 592 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) 593 { 594 trace_postcopy_ram_incoming_cleanup_entry(); 595 596 if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) { 597 /* Notify the fast load thread to quit */ 598 mis->preempt_thread_status = PREEMPT_THREAD_QUIT; 599 /* 600 * Update preempt_thread_status before reading count. Note: mutex 601 * lock only provide ACQUIRE semantic, and it doesn't stops this 602 * write to be reordered after reading the count. 603 */ 604 smp_mb(); 605 /* 606 * It's possible that the preempt thread is still handling the last 607 * pages to arrive which were requested by guest page faults. 608 * Making sure nothing is left behind by waiting on the condvar if 609 * that unlikely case happened. 610 */ 611 WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) { 612 if (qatomic_read(&mis->page_requested_count)) { 613 /* 614 * It is guaranteed to receive a signal later, because the 615 * count>0 now, so it's destined to be decreased to zero 616 * very soon by the preempt thread. 617 */ 618 qemu_cond_wait(&mis->page_request_cond, 619 &mis->page_request_mutex); 620 } 621 } 622 /* Notify the fast load thread to quit */ 623 if (mis->postcopy_qemufile_dst) { 624 qemu_file_shutdown(mis->postcopy_qemufile_dst); 625 } 626 qemu_thread_join(&mis->postcopy_prio_thread); 627 mis->preempt_thread_status = PREEMPT_THREAD_NONE; 628 } 629 630 if (mis->have_fault_thread) { 631 Error *local_err = NULL; 632 633 /* Let the fault thread quit */ 634 qatomic_set(&mis->fault_thread_quit, 1); 635 postcopy_fault_thread_notify(mis); 636 trace_postcopy_ram_incoming_cleanup_join(); 637 qemu_thread_join(&mis->fault_thread); 638 639 if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) { 640 error_report_err(local_err); 641 return -1; 642 } 643 644 if (foreach_not_ignored_block(cleanup_range, mis)) { 645 return -1; 646 } 647 648 trace_postcopy_ram_incoming_cleanup_closeuf(); 649 close(mis->userfault_fd); 650 close(mis->userfault_event_fd); 651 mis->have_fault_thread = false; 652 } 653 654 if (enable_mlock) { 655 if (os_mlock() < 0) { 656 error_report("mlock: %s", strerror(errno)); 657 /* 658 * It doesn't feel right to fail at this point, we have a valid 659 * VM state. 660 */ 661 } 662 } 663 664 postcopy_temp_pages_cleanup(mis); 665 666 trace_postcopy_ram_incoming_cleanup_blocktime( 667 get_postcopy_total_blocktime()); 668 669 trace_postcopy_ram_incoming_cleanup_exit(); 670 return 0; 671 } 672 673 /* 674 * Disable huge pages on an area 675 */ 676 static int nhp_range(RAMBlock *rb, void *opaque) 677 { 678 const char *block_name = qemu_ram_get_idstr(rb); 679 void *host_addr = qemu_ram_get_host_addr(rb); 680 ram_addr_t offset = qemu_ram_get_offset(rb); 681 ram_addr_t length = rb->postcopy_length; 682 trace_postcopy_nhp_range(block_name, host_addr, offset, length); 683 684 /* 685 * Before we do discards we need to ensure those discards really 686 * do delete areas of the page, even if THP thinks a hugepage would 687 * be a good idea, so force hugepages off. 688 */ 689 qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE); 690 691 return 0; 692 } 693 694 /* 695 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard 696 * however leaving it until after precopy means that most of the precopy 697 * data is still THPd 698 */ 699 int postcopy_ram_prepare_discard(MigrationIncomingState *mis) 700 { 701 if (foreach_not_ignored_block(nhp_range, mis)) { 702 return -1; 703 } 704 705 postcopy_state_set(POSTCOPY_INCOMING_DISCARD); 706 707 return 0; 708 } 709 710 /* 711 * Mark the given area of RAM as requiring notification to unwritten areas 712 * Used as a callback on foreach_not_ignored_block. 713 * host_addr: Base of area to mark 714 * offset: Offset in the whole ram arena 715 * length: Length of the section 716 * opaque: MigrationIncomingState pointer 717 * Returns 0 on success 718 */ 719 static int ram_block_enable_notify(RAMBlock *rb, void *opaque) 720 { 721 MigrationIncomingState *mis = opaque; 722 struct uffdio_register reg_struct; 723 724 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb); 725 reg_struct.range.len = rb->postcopy_length; 726 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; 727 728 /* Now tell our userfault_fd that it's responsible for this area */ 729 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) { 730 error_report("%s userfault register: %s", __func__, strerror(errno)); 731 return -1; 732 } 733 if (!(reg_struct.ioctls & (1ULL << _UFFDIO_COPY))) { 734 error_report("%s userfault: Region doesn't support COPY", __func__); 735 return -1; 736 } 737 if (reg_struct.ioctls & (1ULL << _UFFDIO_ZEROPAGE)) { 738 qemu_ram_set_uf_zeroable(rb); 739 } 740 741 return 0; 742 } 743 744 int postcopy_wake_shared(struct PostCopyFD *pcfd, 745 uint64_t client_addr, 746 RAMBlock *rb) 747 { 748 size_t pagesize = qemu_ram_pagesize(rb); 749 struct uffdio_range range; 750 int ret; 751 trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb)); 752 range.start = ROUND_DOWN(client_addr, pagesize); 753 range.len = pagesize; 754 ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range); 755 if (ret) { 756 error_report("%s: Failed to wake: %zx in %s (%s)", 757 __func__, (size_t)client_addr, qemu_ram_get_idstr(rb), 758 strerror(errno)); 759 } 760 return ret; 761 } 762 763 static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb, 764 ram_addr_t start, uint64_t haddr) 765 { 766 void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb)); 767 768 /* 769 * Discarded pages (via RamDiscardManager) are never migrated. On unlikely 770 * access, place a zeropage, which will also set the relevant bits in the 771 * recv_bitmap accordingly, so we won't try placing a zeropage twice. 772 * 773 * Checking a single bit is sufficient to handle pagesize > TPS as either 774 * all relevant bits are set or not. 775 */ 776 assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb))); 777 if (ramblock_page_is_discarded(rb, start)) { 778 bool received = ramblock_recv_bitmap_test_byte_offset(rb, start); 779 780 return received ? 0 : postcopy_place_page_zero(mis, aligned, rb); 781 } 782 783 return migrate_send_rp_req_pages(mis, rb, start, haddr); 784 } 785 786 /* 787 * Callback from shared fault handlers to ask for a page, 788 * the page must be specified by a RAMBlock and an offset in that rb 789 * Note: Only for use by shared fault handlers (in fault thread) 790 */ 791 int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, 792 uint64_t client_addr, uint64_t rb_offset) 793 { 794 uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb)); 795 MigrationIncomingState *mis = migration_incoming_get_current(); 796 797 trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb), 798 rb_offset); 799 if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) { 800 trace_postcopy_request_shared_page_present(pcfd->idstr, 801 qemu_ram_get_idstr(rb), rb_offset); 802 return postcopy_wake_shared(pcfd, client_addr, rb); 803 } 804 postcopy_request_page(mis, rb, aligned_rbo, client_addr); 805 return 0; 806 } 807 808 static int get_mem_fault_cpu_index(uint32_t pid) 809 { 810 CPUState *cpu_iter; 811 812 CPU_FOREACH(cpu_iter) { 813 if (cpu_iter->thread_id == pid) { 814 trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid); 815 return cpu_iter->cpu_index; 816 } 817 } 818 trace_get_mem_fault_cpu_index(-1, pid); 819 return -1; 820 } 821 822 static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc) 823 { 824 int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - 825 dc->start_time; 826 return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX; 827 } 828 829 /* 830 * This function is being called when pagefault occurs. It 831 * tracks down vCPU blocking time. 832 * 833 * @addr: faulted host virtual address 834 * @ptid: faulted process thread id 835 * @rb: ramblock appropriate to addr 836 */ 837 static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, 838 RAMBlock *rb) 839 { 840 int cpu, already_received; 841 MigrationIncomingState *mis = migration_incoming_get_current(); 842 PostcopyBlocktimeContext *dc = mis->blocktime_ctx; 843 uint32_t low_time_offset; 844 845 if (!dc || ptid == 0) { 846 return; 847 } 848 cpu = get_mem_fault_cpu_index(ptid); 849 if (cpu < 0) { 850 return; 851 } 852 853 low_time_offset = get_low_time_offset(dc); 854 if (dc->vcpu_addr[cpu] == 0) { 855 qatomic_inc(&dc->smp_cpus_down); 856 } 857 858 qatomic_xchg(&dc->last_begin, low_time_offset); 859 qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset); 860 qatomic_xchg(&dc->vcpu_addr[cpu], addr); 861 862 /* 863 * check it here, not at the beginning of the function, 864 * due to, check could occur early than bitmap_set in 865 * qemu_ufd_copy_ioctl 866 */ 867 already_received = ramblock_recv_bitmap_test(rb, (void *)addr); 868 if (already_received) { 869 qatomic_xchg(&dc->vcpu_addr[cpu], 0); 870 qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0); 871 qatomic_dec(&dc->smp_cpus_down); 872 } 873 trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu], 874 cpu, already_received); 875 } 876 877 /* 878 * This function just provide calculated blocktime per cpu and trace it. 879 * Total blocktime is calculated in mark_postcopy_blocktime_end. 880 * 881 * 882 * Assume we have 3 CPU 883 * 884 * S1 E1 S1 E1 885 * -----***********------------xxx***************------------------------> CPU1 886 * 887 * S2 E2 888 * ------------****************xxx---------------------------------------> CPU2 889 * 890 * S3 E3 891 * ------------------------****xxx********-------------------------------> CPU3 892 * 893 * We have sequence S1,S2,E1,S3,S1,E2,E3,E1 894 * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3 895 * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 - 896 * it's a part of total blocktime. 897 * S1 - here is last_begin 898 * Legend of the picture is following: 899 * * - means blocktime per vCPU 900 * x - means overlapped blocktime (total blocktime) 901 * 902 * @addr: host virtual address 903 */ 904 static void mark_postcopy_blocktime_end(uintptr_t addr) 905 { 906 MigrationIncomingState *mis = migration_incoming_get_current(); 907 PostcopyBlocktimeContext *dc = mis->blocktime_ctx; 908 MachineState *ms = MACHINE(qdev_get_machine()); 909 unsigned int smp_cpus = ms->smp.cpus; 910 int i, affected_cpu = 0; 911 bool vcpu_total_blocktime = false; 912 uint32_t read_vcpu_time, low_time_offset; 913 914 if (!dc) { 915 return; 916 } 917 918 low_time_offset = get_low_time_offset(dc); 919 /* lookup cpu, to clear it, 920 * that algorithm looks straightforward, but it's not 921 * optimal, more optimal algorithm is keeping tree or hash 922 * where key is address value is a list of */ 923 for (i = 0; i < smp_cpus; i++) { 924 uint32_t vcpu_blocktime = 0; 925 926 read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0); 927 if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr || 928 read_vcpu_time == 0) { 929 continue; 930 } 931 qatomic_xchg(&dc->vcpu_addr[i], 0); 932 vcpu_blocktime = low_time_offset - read_vcpu_time; 933 affected_cpu += 1; 934 /* we need to know is that mark_postcopy_end was due to 935 * faulted page, another possible case it's prefetched 936 * page and in that case we shouldn't be here */ 937 if (!vcpu_total_blocktime && 938 qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) { 939 vcpu_total_blocktime = true; 940 } 941 /* continue cycle, due to one page could affect several vCPUs */ 942 dc->vcpu_blocktime[i] += vcpu_blocktime; 943 } 944 945 qatomic_sub(&dc->smp_cpus_down, affected_cpu); 946 if (vcpu_total_blocktime) { 947 dc->total_blocktime += low_time_offset - qatomic_fetch_add( 948 &dc->last_begin, 0); 949 } 950 trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime, 951 affected_cpu); 952 } 953 954 static void postcopy_pause_fault_thread(MigrationIncomingState *mis) 955 { 956 trace_postcopy_pause_fault_thread(); 957 qemu_sem_wait(&mis->postcopy_pause_sem_fault); 958 trace_postcopy_pause_fault_thread_continued(); 959 } 960 961 /* 962 * Handle faults detected by the USERFAULT markings 963 */ 964 static void *postcopy_ram_fault_thread(void *opaque) 965 { 966 MigrationIncomingState *mis = opaque; 967 struct uffd_msg msg; 968 int ret; 969 size_t index; 970 RAMBlock *rb = NULL; 971 972 trace_postcopy_ram_fault_thread_entry(); 973 rcu_register_thread(); 974 mis->last_rb = NULL; /* last RAMBlock we sent part of */ 975 qemu_sem_post(&mis->thread_sync_sem); 976 977 struct pollfd *pfd; 978 size_t pfd_len = 2 + mis->postcopy_remote_fds->len; 979 980 pfd = g_new0(struct pollfd, pfd_len); 981 982 pfd[0].fd = mis->userfault_fd; 983 pfd[0].events = POLLIN; 984 pfd[1].fd = mis->userfault_event_fd; 985 pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ 986 trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd); 987 for (index = 0; index < mis->postcopy_remote_fds->len; index++) { 988 struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds, 989 struct PostCopyFD, index); 990 pfd[2 + index].fd = pcfd->fd; 991 pfd[2 + index].events = POLLIN; 992 trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr, 993 pcfd->fd); 994 } 995 996 while (true) { 997 ram_addr_t rb_offset; 998 int poll_result; 999 1000 /* 1001 * We're mainly waiting for the kernel to give us a faulting HVA, 1002 * however we can be told to quit via userfault_quit_fd which is 1003 * an eventfd 1004 */ 1005 1006 poll_result = poll(pfd, pfd_len, -1 /* Wait forever */); 1007 if (poll_result == -1) { 1008 error_report("%s: userfault poll: %s", __func__, strerror(errno)); 1009 break; 1010 } 1011 1012 if (!mis->to_src_file) { 1013 /* 1014 * Possibly someone tells us that the return path is 1015 * broken already using the event. We should hold until 1016 * the channel is rebuilt. 1017 */ 1018 postcopy_pause_fault_thread(mis); 1019 } 1020 1021 if (pfd[1].revents) { 1022 uint64_t tmp64 = 0; 1023 1024 /* Consume the signal */ 1025 if (read(mis->userfault_event_fd, &tmp64, 8) != 8) { 1026 /* Nothing obviously nicer than posting this error. */ 1027 error_report("%s: read() failed", __func__); 1028 } 1029 1030 if (qatomic_read(&mis->fault_thread_quit)) { 1031 trace_postcopy_ram_fault_thread_quit(); 1032 break; 1033 } 1034 } 1035 1036 if (pfd[0].revents) { 1037 poll_result--; 1038 ret = read(mis->userfault_fd, &msg, sizeof(msg)); 1039 if (ret != sizeof(msg)) { 1040 if (errno == EAGAIN) { 1041 /* 1042 * if a wake up happens on the other thread just after 1043 * the poll, there is nothing to read. 1044 */ 1045 continue; 1046 } 1047 if (ret < 0) { 1048 error_report("%s: Failed to read full userfault " 1049 "message: %s", 1050 __func__, strerror(errno)); 1051 break; 1052 } else { 1053 error_report("%s: Read %d bytes from userfaultfd " 1054 "expected %zd", 1055 __func__, ret, sizeof(msg)); 1056 break; /* Lost alignment, don't know what we'd read next */ 1057 } 1058 } 1059 if (msg.event != UFFD_EVENT_PAGEFAULT) { 1060 error_report("%s: Read unexpected event %ud from userfaultfd", 1061 __func__, msg.event); 1062 continue; /* It's not a page fault, shouldn't happen */ 1063 } 1064 1065 rb = qemu_ram_block_from_host( 1066 (void *)(uintptr_t)msg.arg.pagefault.address, 1067 true, &rb_offset); 1068 if (!rb) { 1069 error_report("postcopy_ram_fault_thread: Fault outside guest: %" 1070 PRIx64, (uint64_t)msg.arg.pagefault.address); 1071 break; 1072 } 1073 1074 rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb)); 1075 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, 1076 qemu_ram_get_idstr(rb), 1077 rb_offset, 1078 msg.arg.pagefault.feat.ptid); 1079 mark_postcopy_blocktime_begin( 1080 (uintptr_t)(msg.arg.pagefault.address), 1081 msg.arg.pagefault.feat.ptid, rb); 1082 1083 retry: 1084 /* 1085 * Send the request to the source - we want to request one 1086 * of our host page sizes (which is >= TPS) 1087 */ 1088 ret = postcopy_request_page(mis, rb, rb_offset, 1089 msg.arg.pagefault.address); 1090 if (ret) { 1091 /* May be network failure, try to wait for recovery */ 1092 postcopy_pause_fault_thread(mis); 1093 goto retry; 1094 } 1095 } 1096 1097 /* Now handle any requests from external processes on shared memory */ 1098 /* TODO: May need to handle devices deregistering during postcopy */ 1099 for (index = 2; index < pfd_len && poll_result; index++) { 1100 if (pfd[index].revents) { 1101 struct PostCopyFD *pcfd = 1102 &g_array_index(mis->postcopy_remote_fds, 1103 struct PostCopyFD, index - 2); 1104 1105 poll_result--; 1106 if (pfd[index].revents & POLLERR) { 1107 error_report("%s: POLLERR on poll %zd fd=%d", 1108 __func__, index, pcfd->fd); 1109 pfd[index].events = 0; 1110 continue; 1111 } 1112 1113 ret = read(pcfd->fd, &msg, sizeof(msg)); 1114 if (ret != sizeof(msg)) { 1115 if (errno == EAGAIN) { 1116 /* 1117 * if a wake up happens on the other thread just after 1118 * the poll, there is nothing to read. 1119 */ 1120 continue; 1121 } 1122 if (ret < 0) { 1123 error_report("%s: Failed to read full userfault " 1124 "message: %s (shared) revents=%d", 1125 __func__, strerror(errno), 1126 pfd[index].revents); 1127 /*TODO: Could just disable this sharer */ 1128 break; 1129 } else { 1130 error_report("%s: Read %d bytes from userfaultfd " 1131 "expected %zd (shared)", 1132 __func__, ret, sizeof(msg)); 1133 /*TODO: Could just disable this sharer */ 1134 break; /*Lost alignment,don't know what we'd read next*/ 1135 } 1136 } 1137 if (msg.event != UFFD_EVENT_PAGEFAULT) { 1138 error_report("%s: Read unexpected event %ud " 1139 "from userfaultfd (shared)", 1140 __func__, msg.event); 1141 continue; /* It's not a page fault, shouldn't happen */ 1142 } 1143 /* Call the device handler registered with us */ 1144 ret = pcfd->handler(pcfd, &msg); 1145 if (ret) { 1146 error_report("%s: Failed to resolve shared fault on %zd/%s", 1147 __func__, index, pcfd->idstr); 1148 /* TODO: Fail? Disable this sharer? */ 1149 } 1150 } 1151 } 1152 } 1153 rcu_unregister_thread(); 1154 trace_postcopy_ram_fault_thread_exit(); 1155 g_free(pfd); 1156 return NULL; 1157 } 1158 1159 static int postcopy_temp_pages_setup(MigrationIncomingState *mis) 1160 { 1161 PostcopyTmpPage *tmp_page; 1162 int err, i, channels; 1163 void *temp_page; 1164 1165 if (migrate_postcopy_preempt()) { 1166 /* If preemption enabled, need extra channel for urgent requests */ 1167 mis->postcopy_channels = RAM_CHANNEL_MAX; 1168 } else { 1169 /* Both precopy/postcopy on the same channel */ 1170 mis->postcopy_channels = 1; 1171 } 1172 1173 channels = mis->postcopy_channels; 1174 mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels); 1175 1176 for (i = 0; i < channels; i++) { 1177 tmp_page = &mis->postcopy_tmp_pages[i]; 1178 temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE, 1179 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 1180 if (temp_page == MAP_FAILED) { 1181 err = errno; 1182 error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s", 1183 __func__, i, strerror(err)); 1184 /* Clean up will be done later */ 1185 return -err; 1186 } 1187 tmp_page->tmp_huge_page = temp_page; 1188 /* Initialize default states for each tmp page */ 1189 postcopy_temp_page_reset(tmp_page); 1190 } 1191 1192 /* 1193 * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages 1194 */ 1195 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size, 1196 PROT_READ | PROT_WRITE, 1197 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 1198 if (mis->postcopy_tmp_zero_page == MAP_FAILED) { 1199 err = errno; 1200 mis->postcopy_tmp_zero_page = NULL; 1201 error_report("%s: Failed to map large zero page %s", 1202 __func__, strerror(err)); 1203 return -err; 1204 } 1205 1206 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size); 1207 1208 return 0; 1209 } 1210 1211 int postcopy_ram_incoming_setup(MigrationIncomingState *mis) 1212 { 1213 Error *local_err = NULL; 1214 1215 /* Open the fd for the kernel to give us userfaults */ 1216 mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK); 1217 if (mis->userfault_fd == -1) { 1218 error_report("%s: Failed to open userfault fd: %s", __func__, 1219 strerror(errno)); 1220 return -1; 1221 } 1222 1223 /* 1224 * Although the host check already tested the API, we need to 1225 * do the check again as an ABI handshake on the new fd. 1226 */ 1227 if (!ufd_check_and_apply(mis->userfault_fd, mis, &local_err)) { 1228 error_report_err(local_err); 1229 return -1; 1230 } 1231 1232 /* Now an eventfd we use to tell the fault-thread to quit */ 1233 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC); 1234 if (mis->userfault_event_fd == -1) { 1235 error_report("%s: Opening userfault_event_fd: %s", __func__, 1236 strerror(errno)); 1237 close(mis->userfault_fd); 1238 return -1; 1239 } 1240 1241 postcopy_thread_create(mis, &mis->fault_thread, "fault-default", 1242 postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE); 1243 mis->have_fault_thread = true; 1244 1245 /* Mark so that we get notified of accesses to unwritten areas */ 1246 if (foreach_not_ignored_block(ram_block_enable_notify, mis)) { 1247 error_report("ram_block_enable_notify failed"); 1248 return -1; 1249 } 1250 1251 if (postcopy_temp_pages_setup(mis)) { 1252 /* Error dumped in the sub-function */ 1253 return -1; 1254 } 1255 1256 if (migrate_postcopy_preempt()) { 1257 /* 1258 * This thread needs to be created after the temp pages because 1259 * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately. 1260 */ 1261 postcopy_thread_create(mis, &mis->postcopy_prio_thread, "fault-fast", 1262 postcopy_preempt_thread, QEMU_THREAD_JOINABLE); 1263 mis->preempt_thread_status = PREEMPT_THREAD_CREATED; 1264 } 1265 1266 trace_postcopy_ram_enable_notify(); 1267 1268 return 0; 1269 } 1270 1271 static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr, 1272 void *from_addr, uint64_t pagesize, RAMBlock *rb) 1273 { 1274 int userfault_fd = mis->userfault_fd; 1275 int ret; 1276 1277 if (from_addr) { 1278 struct uffdio_copy copy_struct; 1279 copy_struct.dst = (uint64_t)(uintptr_t)host_addr; 1280 copy_struct.src = (uint64_t)(uintptr_t)from_addr; 1281 copy_struct.len = pagesize; 1282 copy_struct.mode = 0; 1283 ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct); 1284 } else { 1285 struct uffdio_zeropage zero_struct; 1286 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr; 1287 zero_struct.range.len = pagesize; 1288 zero_struct.mode = 0; 1289 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct); 1290 } 1291 if (!ret) { 1292 qemu_mutex_lock(&mis->page_request_mutex); 1293 ramblock_recv_bitmap_set_range(rb, host_addr, 1294 pagesize / qemu_target_page_size()); 1295 /* 1296 * If this page resolves a page fault for a previous recorded faulted 1297 * address, take a special note to maintain the requested page list. 1298 */ 1299 if (g_tree_lookup(mis->page_requested, host_addr)) { 1300 g_tree_remove(mis->page_requested, host_addr); 1301 int left_pages = qatomic_dec_fetch(&mis->page_requested_count); 1302 1303 trace_postcopy_page_req_del(host_addr, mis->page_requested_count); 1304 /* Order the update of count and read of preempt status */ 1305 smp_mb(); 1306 if (mis->preempt_thread_status == PREEMPT_THREAD_QUIT && 1307 left_pages == 0) { 1308 /* 1309 * This probably means the main thread is waiting for us. 1310 * Notify that we've finished receiving the last requested 1311 * page. 1312 */ 1313 qemu_cond_signal(&mis->page_request_cond); 1314 } 1315 } 1316 qemu_mutex_unlock(&mis->page_request_mutex); 1317 mark_postcopy_blocktime_end((uintptr_t)host_addr); 1318 } 1319 return ret; 1320 } 1321 1322 int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset) 1323 { 1324 int i; 1325 MigrationIncomingState *mis = migration_incoming_get_current(); 1326 GArray *pcrfds = mis->postcopy_remote_fds; 1327 1328 for (i = 0; i < pcrfds->len; i++) { 1329 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i); 1330 int ret = cur->waker(cur, rb, offset); 1331 if (ret) { 1332 return ret; 1333 } 1334 } 1335 return 0; 1336 } 1337 1338 /* 1339 * Place a host page (from) at (host) atomically 1340 * returns 0 on success 1341 */ 1342 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, 1343 RAMBlock *rb) 1344 { 1345 size_t pagesize = qemu_ram_pagesize(rb); 1346 1347 /* copy also acks to the kernel waking the stalled thread up 1348 * TODO: We can inhibit that ack and only do it if it was requested 1349 * which would be slightly cheaper, but we'd have to be careful 1350 * of the order of updating our page state. 1351 */ 1352 if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) { 1353 int e = errno; 1354 error_report("%s: %s copy host: %p from: %p (size: %zd)", 1355 __func__, strerror(e), host, from, pagesize); 1356 1357 return -e; 1358 } 1359 1360 trace_postcopy_place_page(host); 1361 return postcopy_notify_shared_wake(rb, 1362 qemu_ram_block_host_offset(rb, host)); 1363 } 1364 1365 /* 1366 * Place a zero page at (host) atomically 1367 * returns 0 on success 1368 */ 1369 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, 1370 RAMBlock *rb) 1371 { 1372 size_t pagesize = qemu_ram_pagesize(rb); 1373 trace_postcopy_place_page_zero(host); 1374 1375 /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE 1376 * but it's not available for everything (e.g. hugetlbpages) 1377 */ 1378 if (qemu_ram_is_uf_zeroable(rb)) { 1379 if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) { 1380 int e = errno; 1381 error_report("%s: %s zero host: %p", 1382 __func__, strerror(e), host); 1383 1384 return -e; 1385 } 1386 return postcopy_notify_shared_wake(rb, 1387 qemu_ram_block_host_offset(rb, 1388 host)); 1389 } else { 1390 return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb); 1391 } 1392 } 1393 1394 #else 1395 /* No target OS support, stubs just fail */ 1396 void fill_destination_postcopy_migration_info(MigrationInfo *info) 1397 { 1398 } 1399 1400 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp) 1401 { 1402 error_report("%s: No OS support", __func__); 1403 return false; 1404 } 1405 1406 int postcopy_ram_incoming_init(MigrationIncomingState *mis) 1407 { 1408 error_report("postcopy_ram_incoming_init: No OS support"); 1409 return -1; 1410 } 1411 1412 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) 1413 { 1414 assert(0); 1415 return -1; 1416 } 1417 1418 int postcopy_ram_prepare_discard(MigrationIncomingState *mis) 1419 { 1420 assert(0); 1421 return -1; 1422 } 1423 1424 int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, 1425 uint64_t client_addr, uint64_t rb_offset) 1426 { 1427 assert(0); 1428 return -1; 1429 } 1430 1431 int postcopy_ram_incoming_setup(MigrationIncomingState *mis) 1432 { 1433 assert(0); 1434 return -1; 1435 } 1436 1437 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, 1438 RAMBlock *rb) 1439 { 1440 assert(0); 1441 return -1; 1442 } 1443 1444 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, 1445 RAMBlock *rb) 1446 { 1447 assert(0); 1448 return -1; 1449 } 1450 1451 int postcopy_wake_shared(struct PostCopyFD *pcfd, 1452 uint64_t client_addr, 1453 RAMBlock *rb) 1454 { 1455 assert(0); 1456 return -1; 1457 } 1458 #endif 1459 1460 /* ------------------------------------------------------------------------- */ 1461 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page) 1462 { 1463 tmp_page->target_pages = 0; 1464 tmp_page->host_addr = NULL; 1465 /* 1466 * This is set to true when reset, and cleared as long as we received any 1467 * of the non-zero small page within this huge page. 1468 */ 1469 tmp_page->all_zero = true; 1470 } 1471 1472 void postcopy_fault_thread_notify(MigrationIncomingState *mis) 1473 { 1474 uint64_t tmp64 = 1; 1475 1476 /* 1477 * Wakeup the fault_thread. It's an eventfd that should currently 1478 * be at 0, we're going to increment it to 1 1479 */ 1480 if (write(mis->userfault_event_fd, &tmp64, 8) != 8) { 1481 /* Not much we can do here, but may as well report it */ 1482 error_report("%s: incrementing failed: %s", __func__, 1483 strerror(errno)); 1484 } 1485 } 1486 1487 /** 1488 * postcopy_discard_send_init: Called at the start of each RAMBlock before 1489 * asking to discard individual ranges. 1490 * 1491 * @ms: The current migration state. 1492 * @offset: the bitmap offset of the named RAMBlock in the migration bitmap. 1493 * @name: RAMBlock that discards will operate on. 1494 */ 1495 static PostcopyDiscardState pds = {0}; 1496 void postcopy_discard_send_init(MigrationState *ms, const char *name) 1497 { 1498 pds.ramblock_name = name; 1499 pds.cur_entry = 0; 1500 pds.nsentwords = 0; 1501 pds.nsentcmds = 0; 1502 } 1503 1504 /** 1505 * postcopy_discard_send_range: Called by the bitmap code for each chunk to 1506 * discard. May send a discard message, may just leave it queued to 1507 * be sent later. 1508 * 1509 * @ms: Current migration state. 1510 * @start,@length: a range of pages in the migration bitmap in the 1511 * RAM block passed to postcopy_discard_send_init() (length=1 is one page) 1512 */ 1513 void postcopy_discard_send_range(MigrationState *ms, unsigned long start, 1514 unsigned long length) 1515 { 1516 size_t tp_size = qemu_target_page_size(); 1517 /* Convert to byte offsets within the RAM block */ 1518 pds.start_list[pds.cur_entry] = start * tp_size; 1519 pds.length_list[pds.cur_entry] = length * tp_size; 1520 trace_postcopy_discard_send_range(pds.ramblock_name, start, length); 1521 pds.cur_entry++; 1522 pds.nsentwords++; 1523 1524 if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) { 1525 /* Full set, ship it! */ 1526 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file, 1527 pds.ramblock_name, 1528 pds.cur_entry, 1529 pds.start_list, 1530 pds.length_list); 1531 pds.nsentcmds++; 1532 pds.cur_entry = 0; 1533 } 1534 } 1535 1536 /** 1537 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the 1538 * bitmap code. Sends any outstanding discard messages, frees the PDS 1539 * 1540 * @ms: Current migration state. 1541 */ 1542 void postcopy_discard_send_finish(MigrationState *ms) 1543 { 1544 /* Anything unsent? */ 1545 if (pds.cur_entry) { 1546 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file, 1547 pds.ramblock_name, 1548 pds.cur_entry, 1549 pds.start_list, 1550 pds.length_list); 1551 pds.nsentcmds++; 1552 } 1553 1554 trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords, 1555 pds.nsentcmds); 1556 } 1557 1558 /* 1559 * Current state of incoming postcopy; note this is not part of 1560 * MigrationIncomingState since it's state is used during cleanup 1561 * at the end as MIS is being freed. 1562 */ 1563 static PostcopyState incoming_postcopy_state; 1564 1565 PostcopyState postcopy_state_get(void) 1566 { 1567 return qatomic_load_acquire(&incoming_postcopy_state); 1568 } 1569 1570 /* Set the state and return the old state */ 1571 PostcopyState postcopy_state_set(PostcopyState new_state) 1572 { 1573 return qatomic_xchg(&incoming_postcopy_state, new_state); 1574 } 1575 1576 /* Register a handler for external shared memory postcopy 1577 * called on the destination. 1578 */ 1579 void postcopy_register_shared_ufd(struct PostCopyFD *pcfd) 1580 { 1581 MigrationIncomingState *mis = migration_incoming_get_current(); 1582 1583 mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds, 1584 *pcfd); 1585 } 1586 1587 /* Unregister a handler for external shared memory postcopy 1588 */ 1589 void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd) 1590 { 1591 guint i; 1592 MigrationIncomingState *mis = migration_incoming_get_current(); 1593 GArray *pcrfds = mis->postcopy_remote_fds; 1594 1595 if (!pcrfds) { 1596 /* migration has already finished and freed the array */ 1597 return; 1598 } 1599 for (i = 0; i < pcrfds->len; i++) { 1600 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i); 1601 if (cur->fd == pcfd->fd) { 1602 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i); 1603 return; 1604 } 1605 } 1606 } 1607 1608 void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file) 1609 { 1610 /* 1611 * The new loading channel has its own threads, so it needs to be 1612 * blocked too. It's by default true, just be explicit. 1613 */ 1614 qemu_file_set_blocking(file, true); 1615 mis->postcopy_qemufile_dst = file; 1616 qemu_sem_post(&mis->postcopy_qemufile_dst_done); 1617 trace_postcopy_preempt_new_channel(); 1618 } 1619 1620 /* 1621 * Setup the postcopy preempt channel with the IOC. If ERROR is specified, 1622 * setup the error instead. This helper will free the ERROR if specified. 1623 */ 1624 static void 1625 postcopy_preempt_send_channel_done(MigrationState *s, 1626 QIOChannel *ioc, Error *local_err) 1627 { 1628 if (local_err) { 1629 migrate_set_error(s, local_err); 1630 error_free(local_err); 1631 } else { 1632 migration_ioc_register_yank(ioc); 1633 s->postcopy_qemufile_src = qemu_file_new_output(ioc); 1634 trace_postcopy_preempt_new_channel(); 1635 } 1636 1637 /* 1638 * Kick the waiter in all cases. The waiter should check upon 1639 * postcopy_qemufile_src to know whether it failed or not. 1640 */ 1641 qemu_sem_post(&s->postcopy_qemufile_src_sem); 1642 } 1643 1644 static void 1645 postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque) 1646 { 1647 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task)); 1648 MigrationState *s = opaque; 1649 Error *local_err = NULL; 1650 1651 qio_task_propagate_error(task, &local_err); 1652 postcopy_preempt_send_channel_done(s, ioc, local_err); 1653 } 1654 1655 static void 1656 postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque) 1657 { 1658 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task)); 1659 MigrationState *s = opaque; 1660 QIOChannelTLS *tioc; 1661 Error *local_err = NULL; 1662 1663 if (qio_task_propagate_error(task, &local_err)) { 1664 goto out; 1665 } 1666 1667 if (migrate_channel_requires_tls_upgrade(ioc)) { 1668 tioc = migration_tls_client_create(ioc, s->hostname, &local_err); 1669 if (!tioc) { 1670 goto out; 1671 } 1672 trace_postcopy_preempt_tls_handshake(); 1673 qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt"); 1674 qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake, 1675 s, NULL, NULL); 1676 /* Setup the channel until TLS handshake finished */ 1677 return; 1678 } 1679 1680 out: 1681 /* This handles both good and error cases */ 1682 postcopy_preempt_send_channel_done(s, ioc, local_err); 1683 } 1684 1685 /* 1686 * This function will kick off an async task to establish the preempt 1687 * channel, and wait until the connection setup completed. Returns 0 if 1688 * channel established, -1 for error. 1689 */ 1690 int postcopy_preempt_establish_channel(MigrationState *s) 1691 { 1692 /* If preempt not enabled, no need to wait */ 1693 if (!migrate_postcopy_preempt()) { 1694 return 0; 1695 } 1696 1697 /* 1698 * Kick off async task to establish preempt channel. Only do so with 1699 * 8.0+ machines, because 7.1/7.2 require the channel to be created in 1700 * setup phase of migration (even if racy in an unreliable network). 1701 */ 1702 if (!s->preempt_pre_7_2) { 1703 postcopy_preempt_setup(s); 1704 } 1705 1706 /* 1707 * We need the postcopy preempt channel to be established before 1708 * starting doing anything. 1709 */ 1710 qemu_sem_wait(&s->postcopy_qemufile_src_sem); 1711 1712 return s->postcopy_qemufile_src ? 0 : -1; 1713 } 1714 1715 void postcopy_preempt_setup(MigrationState *s) 1716 { 1717 /* Kick an async task to connect */ 1718 socket_send_channel_create(postcopy_preempt_send_channel_new, s); 1719 } 1720 1721 static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis) 1722 { 1723 trace_postcopy_pause_fast_load(); 1724 qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex); 1725 qemu_sem_wait(&mis->postcopy_pause_sem_fast_load); 1726 qemu_mutex_lock(&mis->postcopy_prio_thread_mutex); 1727 trace_postcopy_pause_fast_load_continued(); 1728 } 1729 1730 static bool preempt_thread_should_run(MigrationIncomingState *mis) 1731 { 1732 return mis->preempt_thread_status != PREEMPT_THREAD_QUIT; 1733 } 1734 1735 void *postcopy_preempt_thread(void *opaque) 1736 { 1737 MigrationIncomingState *mis = opaque; 1738 int ret; 1739 1740 trace_postcopy_preempt_thread_entry(); 1741 1742 rcu_register_thread(); 1743 1744 qemu_sem_post(&mis->thread_sync_sem); 1745 1746 /* 1747 * The preempt channel is established in asynchronous way. Wait 1748 * for its completion. 1749 */ 1750 qemu_sem_wait(&mis->postcopy_qemufile_dst_done); 1751 1752 /* Sending RAM_SAVE_FLAG_EOS to terminate this thread */ 1753 qemu_mutex_lock(&mis->postcopy_prio_thread_mutex); 1754 while (preempt_thread_should_run(mis)) { 1755 ret = ram_load_postcopy(mis->postcopy_qemufile_dst, 1756 RAM_CHANNEL_POSTCOPY); 1757 /* If error happened, go into recovery routine */ 1758 if (ret && preempt_thread_should_run(mis)) { 1759 postcopy_pause_ram_fast_load(mis); 1760 } else { 1761 /* We're done */ 1762 break; 1763 } 1764 } 1765 qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex); 1766 1767 rcu_unregister_thread(); 1768 1769 trace_postcopy_preempt_thread_exit(); 1770 1771 return NULL; 1772 } 1773