/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert  <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */

#include "qemu/osdep.h"
#include "qemu/madvise.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "qemu/rcu.h"
#include "sysemu/sysemu.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"
#include "exec/ramblock.h"
#include "socket.h"
#include "yank_functions.h"
#include "tls.h"
#include "qemu/userfaultfd.h"
#include "qemu/mmap-alloc.h"
#include "options.h"

/*
 * Arbitrary limit on the size of each discard command; it keeps each
 * command around ~200 bytes.
 */
#define MAX_DISCARDS_PER_COMMAND 12

typedef struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
} PostcopyDiscardState;

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd, errp);
}

/*
 * NOTE: this routine is not thread safe; don't call it concurrently.
 * That should be good enough for migration's purposes.
 */
void postcopy_thread_create(MigrationIncomingState *mis,
                            QemuThread *thread, const char *name,
                            void *(*fn)(void *), int joinable)
{
    qemu_sem_init(&mis->thread_sync_sem, 0);
    qemu_thread_create(thread, name, fn, mis, joinable);
    qemu_sem_wait(&mis->thread_sync_sem);
    qemu_sem_destroy(&mis->thread_sync_sem);
}

/*
 * Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and to map new pages in efficiently; the techniques for doing
 * this are target OS specific.
 */
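/*
 * On Linux the mechanism is userfaultfd(2). As a rough, illustrative
 * sketch of the life cycle wrapped by the rest of this file (error
 * handling omitted, not part of the build):
 *
 *   int ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
 *   struct uffdio_api api = { .api = UFFD_API };
 *   ioctl(ufd, UFFDIO_API, &api);              // negotiate features
 *   struct uffdio_register reg = {
 *       .range = { .start = (uintptr_t)addr, .len = len },
 *       .mode = UFFDIO_REGISTER_MODE_MISSING,
 *   };
 *   ioctl(ufd, UFFDIO_REGISTER, &reg);         // watch a range
 *   // read(ufd, ...) then yields UFFD_EVENT_PAGEFAULT events, and
 *   // UFFDIO_COPY / UFFDIO_ZEROPAGE resolve them atomically.
 */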
#if defined(__linux__)
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for the exit event, necessary for
     * releasing the whole blocktime context
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;

static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    uint32List *list = NULL;
    int i;

    for (i = ms->smp.cpus - 1; i >= 0; i--) {
        QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
    }

    return list;
}

/*
 * This function populates MigrationInfo from postcopy's blocktime
 * context. It does nothing unless the postcopy-blocktime capability
 * was set.
 *
 * @info: pointer to MigrationInfo to populate
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}

static uint32_t get_postcopy_total_blocktime(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return 0;
    }

    return bc->total_blocktime;
}
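/*
 * When the postcopy-blocktime capability is enabled, the counters above
 * surface on the destination via query-migrate; for example, with four
 * vCPUs (values are hypothetical, in milliseconds):
 *
 *   "postcopy-blocktime": 27,
 *   "postcopy-vcpu-blocktime": [12, 31, 27, 27]
 *
 * i.e. the total time during which all vCPUs were blocked at once, plus
 * a per-vCPU breakdown.
 */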
/**
 * receive_ufd_features: check userfault fd features, so that only supported
 * features will be requested in the future.
 *
 * Returns: true on success
 *
 * __NR_userfaultfd - should be checked beforehand
 * @features: out parameter; on success it contains the uffdio_api.features
 *            provided by the kernel
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    ufd = uffd_open(O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
        return false;
    }

    /* ask features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}

/**
 * request_ufd_features: this function should be called only once on a newly
 * opened ufd; subsequent calls will lead to error.
 *
 * Returns: true on success
 *
 * @ufd: fd obtained from userfaultfd syscall
 * @features: bit mask, see UFFD_API_FEATURES
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = 1ULL << _UFFDIO_REGISTER |
                 1ULL << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}

static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis,
                                Error **errp)
{
    ERRP_GUARD();
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * It's not possible to request UFFD_API twice per fd, and the
     * userfault fd features are persistent, so detect them only once.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_setg(errp, "Userfault feature detection failed");
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (UFFD_FEATURE_THREAD_ID & supported_features) {
        asked_features |= UFFD_FEATURE_THREAD_ID;
        if (migrate_postcopy_blocktime()) {
            if (!mis->blocktime_ctx) {
                mis->blocktime_ctx = blocktime_context_new();
            }
        }
    }
#endif

    /*
     * Request the features, even if asked_features is 0: the kernel
     * expects UFFD_API before UFFDIO_REGISTER on each userfault file
     * descriptor.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_setg(errp, "Failed features %" PRIu64, asked_features);
        return false;
    }

    if (qemu_real_host_page_size() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_setg(errp,
                       "Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
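/*
 * Note the two-step negotiation above: features are probed on a
 * throwaway fd in receive_ufd_features() and then requested on the real
 * fd in request_ufd_features(), because UFFDIO_API may only be issued
 * once per fd; a second attempt on the same fd fails. Illustrative
 * sketch:
 *
 *   ioctl(ufd, UFFDIO_API, &api);   // first call: negotiates
 *   ioctl(ufd, UFFDIO_API, &api);   // second call on same ufd: error
 */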
/* Callback from the postcopy_ram_supported_by_host block iterator.
 */
static int test_ramblock_postcopiable(RAMBlock *rb, Error **errp)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    size_t pagesize = qemu_ram_pagesize(rb);
    QemuFsType fs;

    if (length % pagesize) {
        error_setg(errp,
                   "Postcopy requires RAM blocks to be a page size multiple,"
                   " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                   "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }

    if (rb->fd >= 0) {
        fs = qemu_fd_getfs(rb->fd);
        if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) {
            error_setg(errp,
                       "Host backend files need to be TMPFS or HUGETLBFS only");
            return 1;
        }
    }

    return 0;
}

/*
 * Note: This has the side effect of munlock'ing all of RAM; that's
 * normally fine since, if the postcopy succeeds, mlock gets turned back
 * on at the end.
 */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
{
    ERRP_GUARD();
    long pagesize = qemu_real_host_page_size();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    RAMBlock *block;

    if (qemu_target_page_size() > pagesize) {
        error_setg(errp, "Target page size bigger than host page size");
        goto out;
    }

    ufd = uffd_open(O_CLOEXEC);
    if (ufd == -1) {
        error_setg(errp, "Userfaultfd not available: %s", strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, errp)) {
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis, errp)) {
        goto out;
    }

    /*
     * We don't support postcopy with some types of ramblocks.
     *
     * NOTE: we explicitly ignore migrate_ram_is_ignored() and instead check
     * all possible ramblocks. This is because this function can be called
     * when creating the migration object, during which phase
     * RAM_MIGRATABLE is not yet properly set for all the ramblocks.
     *
     * A side effect of this is that we'll also check RAM_SHARED
     * ramblocks even if migrate_ignore_shared() is set (in which case
     * we'll never migrate RAM_SHARED at all), but normally this shouldn't
     * matter in practice; we can revisit if it does.
     */
    RAMBLOCK_FOREACH(block) {
        if (test_ramblock_postcopiable(block, errp)) {
            goto out;
        }
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_setg(errp, "munlockall() failed: %s", strerror(errno));
        goto out;
    }
    /*
     * We need to check that the ops we need are supported on anon memory.
     * To do that we need to register a chunk and see the flags that
     * are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_setg(errp, "Failed to map test area: %s", strerror(errno));
        goto out;
    }
    g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_setg(errp, "UFFDIO_REGISTER failed: %s", strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_setg(errp, "UFFDIO_UNREGISTER failed: %s", strerror(errno));
        goto out;
    }

    feature_mask = 1ULL << _UFFDIO_WAKE |
                   1ULL << _UFFDIO_COPY |
                   1ULL << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_setg(errp, "Missing userfault map features: %" PRIx64,
                   (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

/*
 * Set up an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start, prior to precopy.
 * opaque should be the MIS.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * Save the used_length before running the guest. In case we have to
     * resize RAM blocks when syncing RAM block sizes from the source during
     * precopy, we'll update it manually via the ram block notifier.
     */
    rb->postcopy_length = length;

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}
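/*
 * For reference, ram_discard_range() above ultimately punches the pages
 * out of the mapping via ram_block_discard_range(); roughly (sketch
 * only) the anonymous-memory case amounts to:
 *
 *   madvise(host_addr + offset, length, MADV_DONTNEED);
 *
 * with file-backed memory additionally needing
 * fallocate(FALLOC_FL_PUNCH_HOLE). After this, the next guest access
 * faults, and once the range is registered with userfaultfd the fault
 * is reported to the fault thread below.
 */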
/*
 * At the end of migration, undo the effects of init_range.
 * opaque should be the MIS.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled;
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages. It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * called from arch_init's similarly named ram_postcopy_incoming_init
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}

static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis)
{
    int i;

    if (mis->postcopy_tmp_pages) {
        for (i = 0; i < mis->postcopy_channels; i++) {
            if (mis->postcopy_tmp_pages[i].tmp_huge_page) {
                munmap(mis->postcopy_tmp_pages[i].tmp_huge_page,
                       mis->largest_page_size);
                mis->postcopy_tmp_pages[i].tmp_huge_page = NULL;
            }
        }
        g_free(mis->postcopy_tmp_pages);
        mis->postcopy_tmp_pages = NULL;
    }

    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
}
/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) {
        /* Notify the fast load thread to quit */
        mis->preempt_thread_status = PREEMPT_THREAD_QUIT;
        /*
         * Update preempt_thread_status before reading count. Note: the
         * mutex lock below only provides ACQUIRE semantics, which doesn't
         * stop this write from being reordered after the read of the count.
         */
        smp_mb();
        /*
         * It's possible that the preempt thread is still handling the last
         * pages to arrive which were requested by guest page faults.
         * Make sure nothing is left behind by waiting on the condvar if
         * that unlikely case happens.
         */
        WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
            if (qatomic_read(&mis->page_requested_count)) {
                /*
                 * It is guaranteed to receive a signal later, because the
                 * count>0 now, so it's destined to be decreased to zero
                 * very soon by the preempt thread.
                 */
                qemu_cond_wait(&mis->page_request_cond,
                               &mis->page_request_mutex);
            }
        }
        /* Notify the fast load thread to quit */
        if (mis->postcopy_qemufile_dst) {
            qemu_file_shutdown(mis->postcopy_qemufile_dst);
        }
        qemu_thread_join(&mis->postcopy_prio_thread);
        mis->preempt_thread_status = PREEMPT_THREAD_NONE;
    }

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Let the fault thread quit */
        qatomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    postcopy_temp_pages_cleanup(mis);

    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}

/*
 * Disable huge pages on an area
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
 * however, leaving it until after precopy means that most of the precopy
 * data is still THPd.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
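/*
 * For orientation: the incoming side normally walks the PostcopyState
 * values in this order (see the postcopy_state_set() call sites):
 *
 *   POSTCOPY_INCOMING_NONE -> ADVISE -> DISCARD -> LISTENING -> RUNNING
 *                                                             -> END
 *
 * postcopy_ram_prepare_discard() above performs the step into DISCARD.
 */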
/*
 * Mark the given area of RAM as requiring notification on unwritten areas.
 * Used as a callback on foreach_not_ignored_block.
 * rb: the RAMBlock to mark
 * opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
    reg_struct.range.len = rb->postcopy_length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & (1ULL << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & (1ULL << _UFFDIO_ZEROPAGE)) {
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    return uffd_wakeup(pcfd->fd,
                       (void *)(uintptr_t)ROUND_DOWN(client_addr, pagesize),
                       pagesize);
}

static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
                                 ram_addr_t start, uint64_t haddr)
{
    void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));

    /*
     * Discarded pages (via RamDiscardManager) are never migrated. On unlikely
     * access, place a zeropage, which will also set the relevant bits in the
     * recv_bitmap accordingly, so we won't try placing a zeropage twice.
     *
     * Checking a single bit is sufficient to handle pagesize > TPS as either
     * all relevant bits are set or not.
     */
    assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
    if (ramblock_page_is_discarded(rb, start)) {
        bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);

        return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
    }

    return migrate_send_rp_req_pages(mis, rb, start, haddr);
}

/*
 * Callback from shared fault handlers to ask for a page.
 * The page must be specified by a RAMBlock and an offset in that rb.
 * Note: Only for use by shared fault handlers (in the fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    postcopy_request_page(mis, rb, aligned_rbo, client_addr);
    return 0;
}
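/*
 * The PostCopyFD machinery above and below exists for external processes
 * that share guest memory, vhost-user back-ends being the primary user:
 * during postcopy setup the back-end hands us its own userfault fd, which
 * is registered via postcopy_register_shared_ufd() and polled by the
 * fault thread alongside our own. A hypothetical handler (sketch only,
 * names invented; the real one for vhost-user lives under hw/virtio)
 * mostly just translates the fault address and asks for the page:
 *
 *   static int my_fault_handler(struct PostCopyFD *pcfd, void *msg_p)
 *   {
 *       struct uffd_msg *msg = msg_p;
 *       uint64_t client_addr = msg->arg.pagefault.address;
 *       // ...translate client_addr into (rb, rb_offset)...
 *       return postcopy_request_shared_page(pcfd, rb, client_addr,
 *                                           rb_offset);
 *   }
 */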
static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}

static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}

/*
 * This function is called when a page fault occurs. It tracks the
 * vCPU blocking time.
 *
 * @addr: faulted host virtual address
 * @ptid: faulted process thread id
 * @rb: ramblock appropriate to addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        qatomic_inc(&dc->smp_cpus_down);
    }

    qatomic_xchg(&dc->last_begin, low_time_offset);
    qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    qatomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check it here, not at the beginning of the function, because the
     * check could happen earlier than bitmap_set in qemu_ufd_copy_ioctl.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        qatomic_xchg(&dc->vcpu_addr[cpu], 0);
        qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        qatomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
/*
 * This function calculates the blocktime per vCPU and traces it.
 * Total blocktime is calculated in mark_postcopy_blocktime_end.
 *
 *
 * Assume we have 3 CPUs
 *
 *      S1        E1           S1               E1
 * -----***********------------xxx***************------------------------> CPU1
 *
 *             S2                E2
 * ------------****************xxx---------------------------------------> CPU2
 *
 *                         S3            E3
 * ------------------------****xxx********-------------------------------> CPU3
 *
 * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
 * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1
 *         doesn't include CPU3
 * S3,S1,E2 - this sequence includes all CPUs, so the overlap is S1,E2 -
 *            it's a part of total blocktime.
 * S1 - here is last_begin
 * Legend of the picture is following:
 *              * - means blocktime per vCPU
 *              x - means overlapped blocktime (total blocktime)
 *
 * @addr: host virtual address
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Look up the vCPU in order to clear it. A linear scan is
     * straightforward but not optimal; a better algorithm would keep a
     * tree or hash keyed by address whose value is a list of vCPUs.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        qatomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * We need to know whether this call was for a faulted page; the
         * other possible case is a prefetched page, which shouldn't be
         * accounted here.
         */
        if (!vcpu_total_blocktime &&
            qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }
        /* continue the loop, because one page could affect several vCPUs */
        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    qatomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - qatomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
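/*
 * Worked example of the accounting above, with 2 vCPUs and illustrative
 * timestamps: vCPU0 faults at t=10ms and its page arrives at t=30ms;
 * vCPU1 faults at t=20ms and its page arrives at t=40ms. Then:
 *
 *   vcpu_blocktime[0] += 30 - 10 = 20   (at t=30)
 *   vcpu_blocktime[1] += 40 - 20 = 20   (at t=40)
 *   total_blocktime   += 30 - 20 = 10   (at t=30, when both vCPUs were
 *                                        down and last_begin == 20)
 *
 * No total blocktime accrues at t=40 because only one vCPU was still
 * blocked by then.
 */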
static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();
    qemu_sem_wait(&mis->postcopy_pause_sem_fault);
    trace_postcopy_pause_fault_thread_continued();
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->thread_sync_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however, we can be told to quit via userfault_event_fd, which
         * is an eventfd.
         */

        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * Possibly someone tells us that the return path is
             * broken already using the event. We should hold until
             * the channel is rebuilt.
             */
            postcopy_pause_fault_thread(mis);
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (qatomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zu",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %u from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            ret = postcopy_request_page(mis, rb, rb_offset,
                                        msg.arg.pagefault.address);
            if (ret) {
                /* May be network failure, try to wait for recovery */
                postcopy_pause_fault_thread(mis);
                goto retry;
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zu fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zu (shared)",
                                     __func__, ret, sizeof(msg));
                        /* TODO: Could just disable this sharer */
                        break; /* Lost alignment, don't know what we'd read next */
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %u "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zu/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fail? Disable this sharer? */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}
static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
{
    PostcopyTmpPage *tmp_page;
    int err, i, channels;
    void *temp_page;

    if (migrate_postcopy_preempt()) {
        /* If preemption enabled, need extra channel for urgent requests */
        mis->postcopy_channels = RAM_CHANNEL_MAX;
    } else {
        /* Both precopy/postcopy on the same channel */
        mis->postcopy_channels = 1;
    }

    channels = mis->postcopy_channels;
    mis->postcopy_tmp_pages = g_malloc0_n(channels, sizeof(PostcopyTmpPage));

    for (i = 0; i < channels; i++) {
        tmp_page = &mis->postcopy_tmp_pages[i];
        temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (temp_page == MAP_FAILED) {
            err = errno;
            error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s",
                         __func__, i, strerror(err));
            /* Clean up will be done later */
            return -err;
        }
        tmp_page->tmp_huge_page = temp_page;
        /* Initialize default states for each tmp page */
        postcopy_temp_page_reset(tmp_page);
    }

    /*
     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
     */
    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                       PROT_READ | PROT_WRITE,
                                       MAP_PRIVATE | MAP_ANONYMOUS,
                                       -1, 0);
    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
        err = errno;
        mis->postcopy_tmp_zero_page = NULL;
        error_report("%s: Failed to map large zero page %s",
                     __func__, strerror(err));
        return -err;
    }

    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);

    return 0;
}
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    Error *local_err = NULL;

    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis, &local_err)) {
        error_report_err(local_err);
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    postcopy_thread_create(mis, &mis->fault_thread, "mig/dst/fault",
                           postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
        error_report("ram_block_enable_notify failed");
        return -1;
    }

    if (postcopy_temp_pages_setup(mis)) {
        /* Error dumped in the sub-function */
        return -1;
    }

    if (migrate_postcopy_preempt()) {
        /*
         * This thread needs to be created after the temp pages because
         * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
         */
        postcopy_thread_create(mis, &mis->postcopy_prio_thread, "mig/dst/preempt",
                               postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
        mis->preempt_thread_status = PREEMPT_THREAD_CREATED;
    }

    trace_postcopy_ram_enable_notify();

    return 0;
}
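/*
 * uffd_copy_page() and uffd_zero_page() used below are thin wrappers
 * (in util/userfaultfd.c) around the resolving ioctls; roughly, and
 * illustratively only, the copy path amounts to:
 *
 *   struct uffdio_copy copy = {
 *       .dst = (uintptr_t)host_addr,
 *       .src = (uintptr_t)from_addr,
 *       .len = pagesize,
 *   };
 *   ioctl(userfault_fd, UFFDIO_COPY, &copy);
 *
 * The kernel installs the page atomically and wakes any thread blocked
 * on it, which is what makes this safe while the guest is running.
 */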
static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int userfault_fd = mis->userfault_fd;
    int ret;

    if (from_addr) {
        ret = uffd_copy_page(userfault_fd, host_addr, from_addr, pagesize,
                             false);
    } else {
        ret = uffd_zero_page(userfault_fd, host_addr, pagesize, false);
    }
    if (!ret) {
        qemu_mutex_lock(&mis->page_request_mutex);
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        /*
         * If this page resolves a page fault for a previously recorded
         * faulted address, take a special note to maintain the requested
         * page list.
         */
        if (g_tree_lookup(mis->page_requested, host_addr)) {
            g_tree_remove(mis->page_requested, host_addr);
            int left_pages = qatomic_dec_fetch(&mis->page_requested_count);

            trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
            /* Order the update of count and read of preempt status */
            smp_mb();
            if (mis->preempt_thread_status == PREEMPT_THREAD_QUIT &&
                left_pages == 0) {
                /*
                 * This probably means the main thread is waiting for us.
                 * Notify that we've finished receiving the last requested
                 * page.
                 */
                qemu_cond_signal(&mis->page_request_cond);
            }
        }
        qemu_mutex_unlock(&mis->page_request_mutex);
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
    }
    return ret;
}

int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    int e;

    /*
     * The copy also acks to the kernel, waking the stalled thread up.
     * TODO: We can inhibit that ack and only do it if it was requested,
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    e = qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb);
    if (e) {
        return e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}
/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /*
     * Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE,
     * but it's not available for everything (e.g. hugetlbpages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        int e;
        e = qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb);
        if (e) {
            return e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
    }
}

#else
/* No target OS support, stubs just fail */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    g_assert_not_reached();
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    g_assert_not_reached();
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    g_assert_not_reached();
}

int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    g_assert_not_reached();
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    g_assert_not_reached();
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    g_assert_not_reached();
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    g_assert_not_reached();
}
#endif

/* ------------------------------------------------------------------------- */
void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page)
{
    tmp_page->target_pages = 0;
    tmp_page->host_addr = NULL;
    /*
     * This is set to true on reset, and cleared as soon as we receive any
     * non-zero small page within this huge page.
     */
    tmp_page->all_zero = true;
}

void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wake up the fault_thread. It's an eventfd that should currently
     * be at 0; we're going to increment it to 1.
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 * asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 */
static PostcopyDiscardState pds = {0};
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}
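/*
 * A quick size check on the batching done by these helpers: each queued
 * entry is a (start, length) pair of uint64_t, i.e. 16 bytes, so a full
 * command carries 12 * 16 = 192 bytes of ranges plus the RAMBlock name
 * and header. That is how MAX_DISCARDS_PER_COMMAND keeps each discard
 * command around the ~200 bytes mentioned at the top of this file.
 */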
/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 * discard. May send a discard message, may just leave it queued to
 * be sent later.
 *
 * @ms: Current migration state.
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
                                 unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert to byte offsets within the RAM block */
    pds.start_list[pds.cur_entry] = start * tp_size;
    pds.length_list[pds.cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
    pds.cur_entry++;
    pds.nsentwords++;

    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
        pds.cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 * bitmap code. Sends any outstanding discard messages and traces the
 * statistics.
 *
 * @ms: Current migration state.
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return qatomic_load_acquire(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return qatomic_xchg(&incoming_postcopy_state, new_state);
}

/*
 * Register a handler for external shared memory postcopy;
 * called on the destination.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}

/*
 * Unregister a handler for external shared memory postcopy.
 */
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    if (!pcrfds) {
        /* migration has already finished and freed the array */
        return;
    }
    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}
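/*
 * Everything from here down implements the postcopy preempt feature:
 * urgent pages (the ones a vCPU is actually blocked on) travel on a
 * dedicated channel with a dedicated thread, so they are not queued
 * behind the background stream of not-yet-requested pages on the main
 * channel.
 */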
void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
{
    /*
     * The new loading channel has its own threads, so it needs to be
     * blocked too. It's true by default; just be explicit.
     */
    qemu_file_set_blocking(file, true);
    mis->postcopy_qemufile_dst = file;
    qemu_sem_post(&mis->postcopy_qemufile_dst_done);
    trace_postcopy_preempt_new_channel();
}

/*
 * Set up the postcopy preempt channel with the IOC. If ERROR is specified,
 * set up the error instead. This helper will free the ERROR if specified.
 */
static void
postcopy_preempt_send_channel_done(MigrationState *s,
                                   QIOChannel *ioc, Error *local_err)
{
    if (local_err) {
        migrate_set_error(s, local_err);
        error_free(local_err);
    } else {
        migration_ioc_register_yank(ioc);
        s->postcopy_qemufile_src = qemu_file_new_output(ioc);
        trace_postcopy_preempt_new_channel();
    }

    /*
     * Kick the waiter in all cases. The waiter should check upon
     * postcopy_qemufile_src to know whether it failed or not.
     */
    qemu_sem_post(&s->postcopy_qemufile_src_sem);
}

static void
postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque)
{
    g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
    MigrationState *s = opaque;
    Error *local_err = NULL;

    qio_task_propagate_error(task, &local_err);
    postcopy_preempt_send_channel_done(s, ioc, local_err);
}

static void
postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
{
    g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
    MigrationState *s = opaque;
    QIOChannelTLS *tioc;
    Error *local_err = NULL;

    if (qio_task_propagate_error(task, &local_err)) {
        goto out;
    }

    if (migrate_channel_requires_tls_upgrade(ioc)) {
        tioc = migration_tls_client_create(ioc, s->hostname, &local_err);
        if (!tioc) {
            goto out;
        }
        trace_postcopy_preempt_tls_handshake();
        qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt");
        qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake,
                                  s, NULL, NULL);
        /* Setup of the channel continues once the TLS handshake finishes */
        return;
    }

out:
    /* This handles both good and error cases */
    postcopy_preempt_send_channel_done(s, ioc, local_err);
}

/*
 * This function will kick off an async task to establish the preempt
 * channel, and wait until the connection setup completes. Returns 0 if
 * the channel was established, -1 on error.
 */
int postcopy_preempt_establish_channel(MigrationState *s)
{
    /* If preempt not enabled, no need to wait */
    if (!migrate_postcopy_preempt()) {
        return 0;
    }

    /*
     * Kick off async task to establish preempt channel. Only do so with
     * 8.0+ machines, because 7.1/7.2 require the channel to be created in
     * setup phase of migration (even if racy in an unreliable network).
     */
    if (!s->preempt_pre_7_2) {
        postcopy_preempt_setup(s);
    }

    /*
     * We need the postcopy preempt channel to be established before
     * starting to do anything.
     */
    qemu_sem_wait(&s->postcopy_qemufile_src_sem);

    return s->postcopy_qemufile_src ? 0 : -1;
}
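/*
 * A note on usage, assuming the usual call flow in migration.c (sketch,
 * not a guarantee of the exact call site): the source side calls
 * postcopy_preempt_establish_channel() from postcopy_start() before
 * flipping the VM into postcopy mode, so the urgent channel exists
 * before the first page fault can possibly be requested.
 */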
void postcopy_preempt_setup(MigrationState *s)
{
    /* Kick an async task to connect */
    socket_send_channel_create(postcopy_preempt_send_channel_new, s);
}

static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fast_load();
    qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
    qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
    qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
    trace_postcopy_pause_fast_load_continued();
}

static bool preempt_thread_should_run(MigrationIncomingState *mis)
{
    return mis->preempt_thread_status != PREEMPT_THREAD_QUIT;
}

void *postcopy_preempt_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    int ret;

    trace_postcopy_preempt_thread_entry();

    rcu_register_thread();

    qemu_sem_post(&mis->thread_sync_sem);

    /*
     * The preempt channel is established asynchronously. Wait for its
     * completion.
     */
    qemu_sem_wait(&mis->postcopy_qemufile_dst_done);

    /* The source sends RAM_SAVE_FLAG_EOS to terminate this thread */
    qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
    while (preempt_thread_should_run(mis)) {
        ret = ram_load_postcopy(mis->postcopy_qemufile_dst,
                                RAM_CHANNEL_POSTCOPY);
        /* If an error happened, go into the recovery routine */
        if (ret && preempt_thread_should_run(mis)) {
            postcopy_pause_ram_fast_load(mis);
        } else {
            /* We're done */
            break;
        }
    }
    qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);

    rcu_unregister_thread();

    trace_postcopy_preempt_thread_exit();

    return NULL;
}

bool postcopy_is_paused(MigrationStatus status)
{
    return status == MIGRATION_STATUS_POSTCOPY_PAUSED ||
        status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP;
}