xref: /openbmc/qemu/migration/postcopy-ram.c (revision f9e1ef74)
1 /*
2  * Postcopy migration for RAM
3  *
4  * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
5  *
6  * Authors:
7  *  Dave Gilbert  <dgilbert@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  *
12  */
13 
14 /*
15  * Postcopy is a migration technique where the execution flips from the
16  * source to the destination before all the data has been copied.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qemu/madvise.h"
21 #include "exec/target_page.h"
22 #include "migration.h"
23 #include "qemu-file.h"
24 #include "savevm.h"
25 #include "postcopy-ram.h"
26 #include "ram.h"
27 #include "qapi/error.h"
28 #include "qemu/notify.h"
29 #include "qemu/rcu.h"
30 #include "sysemu/sysemu.h"
31 #include "qemu/error-report.h"
32 #include "trace.h"
33 #include "hw/boards.h"
34 #include "exec/ramblock.h"
35 #include "socket.h"
36 #include "yank_functions.h"
37 #include "tls.h"
38 #include "qemu/userfaultfd.h"
39 #include "qemu/mmap-alloc.h"
40 
41 /* Arbitrary limit on size of each discard command,
42  * keeps them around ~200 bytes
43  */
44 #define MAX_DISCARDS_PER_COMMAND 12
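/*
 * Note: 12 entries of two 64-bit values give ~192 bytes of payload per
 * command; with the block name and header that stays near the limit above.
 */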
45 
46 struct PostcopyDiscardState {
47     const char *ramblock_name;
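    /* Index of the next free slot in start_list[]/length_list[] below */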
48     uint16_t cur_entry;
49     /*
50      * Start and length of a discard range (bytes)
51      */
52     uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
53     uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
54     unsigned int nsentwords;
55     unsigned int nsentcmds;
56 };
57 
58 static NotifierWithReturnList postcopy_notifier_list;
59 
60 void postcopy_infrastructure_init(void)
61 {
62     notifier_with_return_list_init(&postcopy_notifier_list);
63 }
64 
65 void postcopy_add_notifier(NotifierWithReturn *nn)
66 {
67     notifier_with_return_list_add(&postcopy_notifier_list, nn);
68 }
69 
70 void postcopy_remove_notifier(NotifierWithReturn *n)
71 {
72     notifier_with_return_remove(n);
73 }
74 
75 int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
76 {
77     struct PostcopyNotifyData pnd;
78     pnd.reason = reason;
79     pnd.errp = errp;
80 
81     return notifier_with_return_list_notify(&postcopy_notifier_list,
82                                             &pnd);
83 }
84 
85 /*
86  * NOTE: this routine is not thread safe; don't call it concurrently.  It
87  * should be good enough for migration's purposes.
88  */
89 void postcopy_thread_create(MigrationIncomingState *mis,
90                             QemuThread *thread, const char *name,
91                             void *(*fn)(void *), int joinable)
92 {
93     qemu_sem_init(&mis->thread_sync_sem, 0);
94     qemu_thread_create(thread, name, fn, mis, joinable);
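    /* The new thread is expected to post thread_sync_sem once it has started */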
95     qemu_sem_wait(&mis->thread_sync_sem);
96     qemu_sem_destroy(&mis->thread_sync_sem);
97 }
98 
99 /* Postcopy needs to detect accesses to pages that haven't yet been copied
100  * across, and to map new pages in efficiently; the techniques for doing this
101  * are target-OS specific.
102  */
103 #if defined(__linux__)
104 
105 #include <poll.h>
106 #include <sys/ioctl.h>
107 #include <sys/syscall.h>
108 #include <asm/types.h> /* for __u64 */
109 #endif
110 
111 #if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
112 #include <sys/eventfd.h>
113 #include <linux/userfaultfd.h>
114 
115 typedef struct PostcopyBlocktimeContext {
116     /* time when page fault initiated per vCPU */
117     uint32_t *page_fault_vcpu_time;
118     /* page address per vCPU */
119     uintptr_t *vcpu_addr;
120     uint32_t total_blocktime;
121     /* blocktime per vCPU */
122     uint32_t *vcpu_blocktime;
123     /* point in time when last page fault was initiated */
124     uint32_t last_begin;
125     /* number of vCPUs that are suspended */
126     int smp_cpus_down;
127     uint64_t start_time;
128 
129     /*
130      * Handler for exit event, necessary for
131      * releasing whole blocktime_ctx
132      */
133     Notifier exit_notifier;
134 } PostcopyBlocktimeContext;
135 
136 static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
137 {
138     g_free(ctx->page_fault_vcpu_time);
139     g_free(ctx->vcpu_addr);
140     g_free(ctx->vcpu_blocktime);
141     g_free(ctx);
142 }
143 
144 static void migration_exit_cb(Notifier *n, void *data)
145 {
146     PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
147                                                  exit_notifier);
148     destroy_blocktime_context(ctx);
149 }
150 
151 static struct PostcopyBlocktimeContext *blocktime_context_new(void)
152 {
153     MachineState *ms = MACHINE(qdev_get_machine());
154     unsigned int smp_cpus = ms->smp.cpus;
155     PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
156     ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
157     ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
158     ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
159 
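    /* Have the context freed automatically when QEMU exits */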
160     ctx->exit_notifier.notify = migration_exit_cb;
161     ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
162     qemu_add_exit_notifier(&ctx->exit_notifier);
163     return ctx;
164 }
165 
166 static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
167 {
168     MachineState *ms = MACHINE(qdev_get_machine());
169     uint32List *list = NULL;
170     int i;
171 
172     for (i = ms->smp.cpus - 1; i >= 0; i--) {
173         QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
174     }
175 
176     return list;
177 }
178 
179 /*
180  * Populate MigrationInfo from postcopy's blocktime context.
181  * It does nothing unless the postcopy-blocktime capability
182  * was set.
183  *
184  * @info: pointer to MigrationInfo to populate
185  */
186 void fill_destination_postcopy_migration_info(MigrationInfo *info)
187 {
188     MigrationIncomingState *mis = migration_incoming_get_current();
189     PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
190 
191     if (!bc) {
192         return;
193     }
194 
195     info->has_postcopy_blocktime = true;
196     info->postcopy_blocktime = bc->total_blocktime;
197     info->has_postcopy_vcpu_blocktime = true;
198     info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
199 }
200 
201 static uint32_t get_postcopy_total_blocktime(void)
202 {
203     MigrationIncomingState *mis = migration_incoming_get_current();
204     PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
205 
206     if (!bc) {
207         return 0;
208     }
209 
210     return bc->total_blocktime;
211 }
212 
213 /**
214  * receive_ufd_features: check the userfault fd features, so that only
215  * supported features are requested later on.
216  *
217  * Returns: true on success
218  *
219  * __NR_userfaultfd - its availability should be checked beforehand
220  *  @features: out parameter; on success it contains the uffdio_api.features
221  *              provided by the kernel
222  */
223 static bool receive_ufd_features(uint64_t *features)
224 {
225     struct uffdio_api api_struct = {0};
226     int ufd;
227     bool ret = true;
228 
229     ufd = uffd_open(O_CLOEXEC);
230     if (ufd == -1) {
231         error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
232         return false;
233     }
234 
235     /* ask features */
236     api_struct.api = UFFD_API;
237     api_struct.features = 0;
238     if (ioctl(ufd, UFFDIO_API, &api_struct)) {
239         error_report("%s: UFFDIO_API failed: %s", __func__,
240                      strerror(errno));
241         ret = false;
242         goto release_ufd;
243     }
244 
245     *features = api_struct.features;
246 
247 release_ufd:
248     close(ufd);
249     return ret;
250 }
251 
252 /**
253  * request_ufd_features: this function should be called only once on a newly
254  * opened ufd; subsequent calls will lead to an error.
255  *
256  * Returns: true on success
257  *
258  * @ufd: fd obtained from userfaultfd syscall
259  * @features: bit mask see UFFD_API_FEATURES
260  */
261 static bool request_ufd_features(int ufd, uint64_t features)
262 {
263     struct uffdio_api api_struct = {0};
264     uint64_t ioctl_mask;
265 
266     api_struct.api = UFFD_API;
267     api_struct.features = features;
268     if (ioctl(ufd, UFFDIO_API, &api_struct)) {
269         error_report("%s failed: UFFDIO_API failed: %s", __func__,
270                      strerror(errno));
271         return false;
272     }
273 
274     ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
275                  (__u64)1 << _UFFDIO_UNREGISTER;
276     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
277         error_report("Missing userfault features: %" PRIx64,
278                      (uint64_t)(~api_struct.ioctls & ioctl_mask));
279         return false;
280     }
281 
282     return true;
283 }
284 
285 static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
286 {
287     uint64_t asked_features = 0;
288     static uint64_t supported_features;
289 
290     /*
291      * It's not possible to request UFFD_API twice
292      * per one fd; the userfault fd features
293      * are persistent.
294      */
295     if (!supported_features) {
296         if (!receive_ufd_features(&supported_features)) {
297             error_report("%s failed", __func__);
298             return false;
299         }
300     }
301 
302 #ifdef UFFD_FEATURE_THREAD_ID
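    /*
     * Blocktime tracking needs the faulting thread id to map a page fault
     * back to a vCPU, so it is only enabled when THREAD_ID is supported.
     */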
303     if (UFFD_FEATURE_THREAD_ID & supported_features) {
304         asked_features |= UFFD_FEATURE_THREAD_ID;
305         if (migrate_postcopy_blocktime()) {
306             if (!mis->blocktime_ctx) {
307                 mis->blocktime_ctx = blocktime_context_new();
308             }
309         }
310     }
311 #endif
312 
313     /*
314      * Request features even if asked_features is 0, because the
315      * kernel expects UFFD_API before UFFDIO_REGISTER on each
316      * userfault file descriptor.
317      */
318     if (!request_ufd_features(ufd, asked_features)) {
319         error_report("%s failed: features %" PRIu64, __func__,
320                      asked_features);
321         return false;
322     }
323 
324     if (qemu_real_host_page_size() != ram_pagesize_summary()) {
325         bool have_hp = false;
326         /* We've got a huge page */
327 #ifdef UFFD_FEATURE_MISSING_HUGETLBFS
328         have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
329 #endif
330         if (!have_hp) {
331             error_report("Userfault on this host does not support huge pages");
332             return false;
333         }
334     }
335     return true;
336 }
337 
338 /* Callback from postcopy_ram_supported_by_host block iterator.
339  */
340 static int test_ramblock_postcopiable(RAMBlock *rb)
341 {
342     const char *block_name = qemu_ram_get_idstr(rb);
343     ram_addr_t length = qemu_ram_get_used_length(rb);
344     size_t pagesize = qemu_ram_pagesize(rb);
345     QemuFsType fs;
346 
347     if (length % pagesize) {
348         error_report("Postcopy requires RAM blocks to be a page size multiple,"
349                      " block %s is 0x" RAM_ADDR_FMT " bytes with a "
350                      "page size of 0x%zx", block_name, length, pagesize);
351         return 1;
352     }
353 
354     if (rb->fd >= 0) {
355         fs = qemu_fd_getfs(rb->fd);
356         if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) {
357             error_report("Host backend files need to be TMPFS or HUGETLBFS only");
358             return 1;
359         }
360     }
361 
362     return 0;
363 }
364 
365 /*
366  * Note: This has the side effect of munlock'ing all of RAM; that's
367  * normally fine since, if the postcopy succeeds, it gets turned back on at
368  * the end.
369  */
370 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
371 {
372     long pagesize = qemu_real_host_page_size();
373     int ufd = -1;
374     bool ret = false; /* Error unless we change it */
375     void *testarea = NULL;
376     struct uffdio_register reg_struct;
377     struct uffdio_range range_struct;
378     uint64_t feature_mask;
379     Error *local_err = NULL;
380     RAMBlock *block;
381 
382     if (qemu_target_page_size() > pagesize) {
383         error_report("Target page size bigger than host page size");
384         goto out;
385     }
386 
387     ufd = uffd_open(O_CLOEXEC);
388     if (ufd == -1) {
389         error_report("%s: userfaultfd not available: %s", __func__,
390                      strerror(errno));
391         goto out;
392     }
393 
394     /* Give devices a chance to object */
395     if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
396         error_report_err(local_err);
397         goto out;
398     }
399 
400     /* Version and features check */
401     if (!ufd_check_and_apply(ufd, mis)) {
402         goto out;
403     }
404 
405     /*
406      * We don't support postcopy with some types of ramblocks.
407      *
408      * NOTE: we explicitly ignore ramblock_is_ignored() and instead check
409      * all possible ramblocks.  This is because this function can be called
410      * when creating the migration object, a phase during which RAM_MIGRATABLE
411      * is not yet properly set for all the ramblocks.
412      *
413      * A side effect of this is that we'll also check RAM_SHARED
414      * ramblocks even if migrate_ignore_shared() is set (in which case
415      * we'll never migrate RAM_SHARED at all), but normally this shouldn't
416      * matter in practice; we can revisit it if it does.
417      */
418     RAMBLOCK_FOREACH(block) {
419         if (test_ramblock_postcopiable(block)) {
420             goto out;
421         }
422     }
423 
424     /*
425      * userfault and mlock don't go together; we'll put it back later if
426      * it was enabled.
427      */
428     if (munlockall()) {
429         error_report("%s: munlockall: %s", __func__,  strerror(errno));
430         goto out;
431     }
432 
433     /*
434      *  We need to check that the ops we need are supported on anon memory.
435      *  To do that we need to register a chunk and see the flags that
436      *  are returned.
437      */
438     testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
439                                     MAP_ANONYMOUS, -1, 0);
440     if (testarea == MAP_FAILED) {
441         error_report("%s: Failed to map test area: %s", __func__,
442                      strerror(errno));
443         goto out;
444     }
445     g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
446 
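    /*
     * Register the test page for MISSING faults and unregister it again;
     * all we really want is the ioctl bitmap the kernel reports back.
     */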
447     reg_struct.range.start = (uintptr_t)testarea;
448     reg_struct.range.len = pagesize;
449     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
450 
451     if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
452         error_report("%s userfault register: %s", __func__, strerror(errno));
453         goto out;
454     }
455 
456     range_struct.start = (uintptr_t)testarea;
457     range_struct.len = pagesize;
458     if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
459         error_report("%s userfault unregister: %s", __func__, strerror(errno));
460         goto out;
461     }
462 
463     feature_mask = (__u64)1 << _UFFDIO_WAKE |
464                    (__u64)1 << _UFFDIO_COPY |
465                    (__u64)1 << _UFFDIO_ZEROPAGE;
466     if ((reg_struct.ioctls & feature_mask) != feature_mask) {
467         error_report("Missing userfault map features: %" PRIx64,
468                      (uint64_t)(~reg_struct.ioctls & feature_mask));
469         goto out;
470     }
471 
472     /* Success! */
473     ret = true;
474 out:
475     if (testarea) {
476         munmap(testarea, pagesize);
477     }
478     if (ufd != -1) {
479         close(ufd);
480     }
481     return ret;
482 }
483 
484 /*
485  * Set up an area of RAM so that it *can* be used for postcopy later; this
486  * must be done right at the start, prior to precopy.
487  * opaque should be the MIS.
488  */
489 static int init_range(RAMBlock *rb, void *opaque)
490 {
491     const char *block_name = qemu_ram_get_idstr(rb);
492     void *host_addr = qemu_ram_get_host_addr(rb);
493     ram_addr_t offset = qemu_ram_get_offset(rb);
494     ram_addr_t length = qemu_ram_get_used_length(rb);
495     trace_postcopy_init_range(block_name, host_addr, offset, length);
496 
497     /*
498      * Save the used_length before running the guest. In case we have to
499      * resize RAM blocks when syncing RAM block sizes from the source during
500      * precopy, we'll update it manually via the ram block notifier.
501      */
502     rb->postcopy_length = length;
503 
504     /*
505      * We need the whole of RAM to be truly empty for postcopy, so things
506      * like ROMs and any data tables built during init must be zero'd
507      * - we're going to get the copy from the source anyway.
508      * (Precopy will just overwrite this data, so doesn't need the discard)
509      */
510     if (ram_discard_range(block_name, 0, length)) {
511         return -1;
512     }
513 
514     return 0;
515 }
516 
517 /*
518  * At the end of migration, undo the effects of init_range
519  * opaque should be the MIS.
520  */
521 static int cleanup_range(RAMBlock *rb, void *opaque)
522 {
523     const char *block_name = qemu_ram_get_idstr(rb);
524     void *host_addr = qemu_ram_get_host_addr(rb);
525     ram_addr_t offset = qemu_ram_get_offset(rb);
526     ram_addr_t length = rb->postcopy_length;
527     MigrationIncomingState *mis = opaque;
528     struct uffdio_range range_struct;
529     trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
530 
531     /*
532      * We turned off hugepages for the precopy stage while postcopy was
533      * enabled; we can turn them back on now.
534      */
535     qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
536 
537     /*
538      * We can also turn off userfault now since we should have all the
539      * pages.   It can be useful to leave it on to debug postcopy
540      * if you're not sure it's always getting every page.
541      */
542     range_struct.start = (uintptr_t)host_addr;
543     range_struct.len = length;
544 
545     if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
546         error_report("%s: userfault unregister %s", __func__, strerror(errno));
547 
548         return -1;
549     }
550 
551     return 0;
552 }
553 
554 /*
555  * Initialise postcopy-ram, setting the RAM to a state where we can go into
556  * postcopy later; must be called prior to any precopy.
557  * Called from arch_init's similarly named ram_postcopy_incoming_init.
558  */
559 int postcopy_ram_incoming_init(MigrationIncomingState *mis)
560 {
561     if (foreach_not_ignored_block(init_range, NULL)) {
562         return -1;
563     }
564 
565     return 0;
566 }
567 
568 static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis)
569 {
570     int i;
571 
572     if (mis->postcopy_tmp_pages) {
573         for (i = 0; i < mis->postcopy_channels; i++) {
574             if (mis->postcopy_tmp_pages[i].tmp_huge_page) {
575                 munmap(mis->postcopy_tmp_pages[i].tmp_huge_page,
576                        mis->largest_page_size);
577                 mis->postcopy_tmp_pages[i].tmp_huge_page = NULL;
578             }
579         }
580         g_free(mis->postcopy_tmp_pages);
581         mis->postcopy_tmp_pages = NULL;
582     }
583 
584     if (mis->postcopy_tmp_zero_page) {
585         munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
586         mis->postcopy_tmp_zero_page = NULL;
587     }
588 }
589 
590 /*
591  * At the end of a migration where postcopy_ram_incoming_init was called.
592  */
593 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
594 {
595     trace_postcopy_ram_incoming_cleanup_entry();
596 
597     if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) {
598         /* Notify the fast load thread to quit */
599         mis->preempt_thread_status = PREEMPT_THREAD_QUIT;
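        /*
         * Shutting down the file below should kick the preempt thread out of
         * any blocking read, so that it can observe PREEMPT_THREAD_QUIT.
         */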
600         if (mis->postcopy_qemufile_dst) {
601             qemu_file_shutdown(mis->postcopy_qemufile_dst);
602         }
603         qemu_thread_join(&mis->postcopy_prio_thread);
604         mis->preempt_thread_status = PREEMPT_THREAD_NONE;
605     }
606 
607     if (mis->have_fault_thread) {
608         Error *local_err = NULL;
609 
610         /* Let the fault thread quit */
611         qatomic_set(&mis->fault_thread_quit, 1);
612         postcopy_fault_thread_notify(mis);
613         trace_postcopy_ram_incoming_cleanup_join();
614         qemu_thread_join(&mis->fault_thread);
615 
616         if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
617             error_report_err(local_err);
618             return -1;
619         }
620 
621         if (foreach_not_ignored_block(cleanup_range, mis)) {
622             return -1;
623         }
624 
625         trace_postcopy_ram_incoming_cleanup_closeuf();
626         close(mis->userfault_fd);
627         close(mis->userfault_event_fd);
628         mis->have_fault_thread = false;
629     }
630 
631     if (enable_mlock) {
632         if (os_mlock() < 0) {
633             error_report("mlock: %s", strerror(errno));
634             /*
635              * It doesn't feel right to fail at this point, we have a valid
636              * VM state.
637              */
638         }
639     }
640 
641     postcopy_temp_pages_cleanup(mis);
642 
643     trace_postcopy_ram_incoming_cleanup_blocktime(
644             get_postcopy_total_blocktime());
645 
646     trace_postcopy_ram_incoming_cleanup_exit();
647     return 0;
648 }
649 
650 /*
651  * Disable huge pages on an area
652  */
653 static int nhp_range(RAMBlock *rb, void *opaque)
654 {
655     const char *block_name = qemu_ram_get_idstr(rb);
656     void *host_addr = qemu_ram_get_host_addr(rb);
657     ram_addr_t offset = qemu_ram_get_offset(rb);
658     ram_addr_t length = rb->postcopy_length;
659     trace_postcopy_nhp_range(block_name, host_addr, offset, length);
660 
661     /*
662      * Before we do discards we need to ensure those discards really
663      * do delete areas of the page, even if THP thinks a hugepage would
664      * be a good idea, so force hugepages off.
665      */
666     qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
667 
668     return 0;
669 }
670 
671 /*
672  * Userfault requires us to mark RAM as NOHUGEPAGE prior to discarding;
673  * however, leaving it until after precopy means that most of the precopy
674  * data is still THP'd.
675  */
676 int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
677 {
678     if (foreach_not_ignored_block(nhp_range, mis)) {
679         return -1;
680     }
681 
682     postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
683 
684     return 0;
685 }
686 
687 /*
688  * Mark the given area of RAM so that we get notified of accesses to
689  * unwritten areas.  Used as a callback on foreach_not_ignored_block.
690  *   rb: RAMBlock to mark; its host address and postcopy_length describe
691  *       the area registered with userfaultfd
692  *   opaque: MigrationIncomingState pointer
693  *
694  * Returns 0 on success
695  */
696 static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
697 {
698     MigrationIncomingState *mis = opaque;
699     struct uffdio_register reg_struct;
700 
701     reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
702     reg_struct.range.len = rb->postcopy_length;
703     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
704 
705     /* Now tell our userfault_fd that it's responsible for this area */
706     if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
707         error_report("%s userfault register: %s", __func__, strerror(errno));
708         return -1;
709     }
710     if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
711         error_report("%s userfault: Region doesn't support COPY", __func__);
712         return -1;
713     }
714     if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
715         qemu_ram_set_uf_zeroable(rb);
716     }
717 
718     return 0;
719 }
720 
721 int postcopy_wake_shared(struct PostCopyFD *pcfd,
722                          uint64_t client_addr,
723                          RAMBlock *rb)
724 {
725     size_t pagesize = qemu_ram_pagesize(rb);
726     struct uffdio_range range;
727     int ret;
728     trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
729     range.start = ROUND_DOWN(client_addr, pagesize);
730     range.len = pagesize;
731     ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
732     if (ret) {
733         error_report("%s: Failed to wake: %zx in %s (%s)",
734                      __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
735                      strerror(errno));
736     }
737     return ret;
738 }
739 
740 static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
741                                  ram_addr_t start, uint64_t haddr)
742 {
743     void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
744 
745     /*
746      * Discarded pages (via RamDiscardManager) are never migrated. On unlikely
747      * access, place a zeropage, which will also set the relevant bits in the
748      * recv_bitmap accordingly, so we won't try placing a zeropage twice.
749      *
750      * Checking a single bit is sufficient to handle pagesize > TPS as either
751      * all relevant bits are set or not.
752      */
753     assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
754     if (ramblock_page_is_discarded(rb, start)) {
755         bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);
756 
757         return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
758     }
759 
760     return migrate_send_rp_req_pages(mis, rb, start, haddr);
761 }
762 
763 /*
764  * Callback from shared fault handlers to ask for a page;
765  * the page must be specified by a RAMBlock and an offset in that rb.
766  * Note: Only for use by shared fault handlers (in fault thread)
767  */
768 int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
769                                  uint64_t client_addr, uint64_t rb_offset)
770 {
771     uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
772     MigrationIncomingState *mis = migration_incoming_get_current();
773 
774     trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
775                                        rb_offset);
776     if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
777         trace_postcopy_request_shared_page_present(pcfd->idstr,
778                                         qemu_ram_get_idstr(rb), rb_offset);
779         return postcopy_wake_shared(pcfd, client_addr, rb);
780     }
781     postcopy_request_page(mis, rb, aligned_rbo, client_addr);
782     return 0;
783 }
784 
785 static int get_mem_fault_cpu_index(uint32_t pid)
786 {
787     CPUState *cpu_iter;
788 
789     CPU_FOREACH(cpu_iter) {
790         if (cpu_iter->thread_id == pid) {
791             trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
792             return cpu_iter->cpu_index;
793         }
794     }
795     trace_get_mem_fault_cpu_index(-1, pid);
796     return -1;
797 }
798 
799 static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
800 {
801     int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
802                                     dc->start_time;
803     return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
804 }
805 
806 /*
807  * This function is called when a page fault occurs. It
808  * tracks the vCPU blocking time.
809  *
810  * @addr: faulted host virtual address
811  * @ptid: faulted process thread id
812  * @rb: ramblock appropriate to addr
813  */
814 static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
815                                           RAMBlock *rb)
816 {
817     int cpu, already_received;
818     MigrationIncomingState *mis = migration_incoming_get_current();
819     PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
820     uint32_t low_time_offset;
821 
822     if (!dc || ptid == 0) {
823         return;
824     }
825     cpu = get_mem_fault_cpu_index(ptid);
826     if (cpu < 0) {
827         return;
828     }
829 
830     low_time_offset = get_low_time_offset(dc);
831     if (dc->vcpu_addr[cpu] == 0) {
832         qatomic_inc(&dc->smp_cpus_down);
833     }
834 
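    /*
     * Record when and where this vCPU blocked; the address entry is cleared
     * again in mark_postcopy_blocktime_end() once the page has been placed.
     */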
835     qatomic_xchg(&dc->last_begin, low_time_offset);
836     qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
837     qatomic_xchg(&dc->vcpu_addr[cpu], addr);
838 
839     /*
840      * Check it here rather than at the beginning of the function,
841      * because the check could otherwise happen earlier than bitmap_set
842      * in qemu_ufd_copy_ioctl.
843      */
844     already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
845     if (already_received) {
846         qatomic_xchg(&dc->vcpu_addr[cpu], 0);
847         qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
848         qatomic_dec(&dc->smp_cpus_down);
849     }
850     trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
851                                         cpu, already_received);
852 }
853 
854 /*
855  *  This function provides the calculated blocktime per vCPU and traces it.
856  *  The total blocktime is also accumulated here, in mark_postcopy_blocktime_end.
857  *
858  *
859  * Assume we have 3 CPUs
860  *
861  *      S1        E1           S1               E1
862  * -----***********------------xxx***************------------------------> CPU1
863  *
864  *             S2                E2
865  * ------------****************xxx---------------------------------------> CPU2
866  *
867  *                         S3            E3
868  * ------------------------****xxx********-------------------------------> CPU3
869  *
870  * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
871  * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1 doesn't include CPU3
872  * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 -
873  *            it's a part of total blocktime.
874  * S1 - here is last_begin
875  * The legend of the picture is as follows:
876  *              * - means blocktime per vCPU
877  *              x - means overlapped blocktime (total blocktime)
878  *
879  * @addr: host virtual address
880  */
881 static void mark_postcopy_blocktime_end(uintptr_t addr)
882 {
883     MigrationIncomingState *mis = migration_incoming_get_current();
884     PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
885     MachineState *ms = MACHINE(qdev_get_machine());
886     unsigned int smp_cpus = ms->smp.cpus;
887     int i, affected_cpu = 0;
888     bool vcpu_total_blocktime = false;
889     uint32_t read_vcpu_time, low_time_offset;
890 
891     if (!dc) {
892         return;
893     }
894 
895     low_time_offset = get_low_time_offset(dc);
896     /* Look up the vCPU(s) to clear.
897      * This algorithm looks straightforward, but it's not
898      * optimal; a more optimal algorithm would keep a tree or hash
899      * keyed by address, whose value is the list of blocked vCPUs.  */
900     for (i = 0; i < smp_cpus; i++) {
901         uint32_t vcpu_blocktime = 0;
902 
903         read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
904         if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
905             read_vcpu_time == 0) {
906             continue;
907         }
908         qatomic_xchg(&dc->vcpu_addr[i], 0);
909         vcpu_blocktime = low_time_offset - read_vcpu_time;
910         affected_cpu += 1;
911         /* We need to know whether mark_postcopy_blocktime_end was due to a
912          * faulted page; the other possible case is a prefetched
913          * page, and in that case we shouldn't be here. */
914         if (!vcpu_total_blocktime &&
915             qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
916             vcpu_total_blocktime = true;
917         }
918         /* Continue the loop, because one page could affect several vCPUs. */
919         dc->vcpu_blocktime[i] += vcpu_blocktime;
920     }
921 
922     qatomic_sub(&dc->smp_cpus_down, affected_cpu);
923     if (vcpu_total_blocktime) {
924         dc->total_blocktime += low_time_offset - qatomic_fetch_add(
925                 &dc->last_begin, 0);
926     }
927     trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
928                                       affected_cpu);
929 }
930 
931 static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
932 {
933     trace_postcopy_pause_fault_thread();
934     qemu_sem_wait(&mis->postcopy_pause_sem_fault);
935     trace_postcopy_pause_fault_thread_continued();
936 }
937 
938 /*
939  * Handle faults detected by the USERFAULT markings
940  */
941 static void *postcopy_ram_fault_thread(void *opaque)
942 {
943     MigrationIncomingState *mis = opaque;
944     struct uffd_msg msg;
945     int ret;
946     size_t index;
947     RAMBlock *rb = NULL;
948 
949     trace_postcopy_ram_fault_thread_entry();
950     rcu_register_thread();
951     mis->last_rb = NULL; /* last RAMBlock we sent part of */
952     qemu_sem_post(&mis->thread_sync_sem);
953 
954     struct pollfd *pfd;
955     size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
956 
957     pfd = g_new0(struct pollfd, pfd_len);
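    /* pfd[0] is the userfault fd, pfd[1] the quit eventfd, pfd[2..] shared fds */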
958 
959     pfd[0].fd = mis->userfault_fd;
960     pfd[0].events = POLLIN;
961     pfd[1].fd = mis->userfault_event_fd;
962     pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
963     trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
964     for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
965         struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
966                                                  struct PostCopyFD, index);
967         pfd[2 + index].fd = pcfd->fd;
968         pfd[2 + index].events = POLLIN;
969         trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
970                                                   pcfd->fd);
971     }
972 
973     while (true) {
974         ram_addr_t rb_offset;
975         int poll_result;
976 
977         /*
978          * We're mainly waiting for the kernel to give us a faulting HVA,
979          * however we can be told to quit via userfault_event_fd, which is
980          * an eventfd.
981          */
982 
983         poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
984         if (poll_result == -1) {
985             error_report("%s: userfault poll: %s", __func__, strerror(errno));
986             break;
987         }
988 
989         if (!mis->to_src_file) {
990             /*
991              * Someone may have used the event to tell us that the return
992              * path is already broken. We should hold until
993              * the channel is rebuilt.
994              */
995             postcopy_pause_fault_thread(mis);
996         }
997 
998         if (pfd[1].revents) {
999             uint64_t tmp64 = 0;
1000 
1001             /* Consume the signal */
1002             if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
1003                 /* Nothing obviously nicer than posting this error. */
1004                 error_report("%s: read() failed", __func__);
1005             }
1006 
1007             if (qatomic_read(&mis->fault_thread_quit)) {
1008                 trace_postcopy_ram_fault_thread_quit();
1009                 break;
1010             }
1011         }
1012 
1013         if (pfd[0].revents) {
1014             poll_result--;
1015             ret = read(mis->userfault_fd, &msg, sizeof(msg));
1016             if (ret != sizeof(msg)) {
1017                 if (errno == EAGAIN) {
1018                     /*
1019                      * if a wake up happens on the other thread just after
1020                      * the poll, there is nothing to read.
1021                      */
1022                     continue;
1023                 }
1024                 if (ret < 0) {
1025                     error_report("%s: Failed to read full userfault "
1026                                  "message: %s",
1027                                  __func__, strerror(errno));
1028                     break;
1029                 } else {
1030                     error_report("%s: Read %d bytes from userfaultfd "
1031                                  "expected %zd",
1032                                  __func__, ret, sizeof(msg));
1033                     break; /* Lost alignment, don't know what we'd read next */
1034                 }
1035             }
1036             if (msg.event != UFFD_EVENT_PAGEFAULT) {
1037                 error_report("%s: Read unexpected event %u from userfaultfd",
1038                              __func__, msg.event);
1039                 continue; /* It's not a page fault, shouldn't happen */
1040             }
1041 
1042             rb = qemu_ram_block_from_host(
1043                      (void *)(uintptr_t)msg.arg.pagefault.address,
1044                      true, &rb_offset);
1045             if (!rb) {
1046                 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
1047                              PRIx64, (uint64_t)msg.arg.pagefault.address);
1048                 break;
1049             }
1050 
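            /* Round the offset down to the start of the faulting host page */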
1051             rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
1052             trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
1053                                                 qemu_ram_get_idstr(rb),
1054                                                 rb_offset,
1055                                                 msg.arg.pagefault.feat.ptid);
1056             mark_postcopy_blocktime_begin(
1057                     (uintptr_t)(msg.arg.pagefault.address),
1058                                 msg.arg.pagefault.feat.ptid, rb);
1059 
1060 retry:
1061             /*
1062              * Send the request to the source - we want to request one
1063              * of our host page sizes (which is >= TPS)
1064              */
1065             ret = postcopy_request_page(mis, rb, rb_offset,
1066                                         msg.arg.pagefault.address);
1067             if (ret) {
1068                 /* May be network failure, try to wait for recovery */
1069                 postcopy_pause_fault_thread(mis);
1070                 goto retry;
1071             }
1072         }
1073 
1074         /* Now handle any requests from external processes on shared memory */
1075         /* TODO: May need to handle devices deregistering during postcopy */
1076         for (index = 2; index < pfd_len && poll_result; index++) {
1077             if (pfd[index].revents) {
1078                 struct PostCopyFD *pcfd =
1079                     &g_array_index(mis->postcopy_remote_fds,
1080                                    struct PostCopyFD, index - 2);
1081 
1082                 poll_result--;
1083                 if (pfd[index].revents & POLLERR) {
1084                     error_report("%s: POLLERR on poll %zd fd=%d",
1085                                  __func__, index, pcfd->fd);
1086                     pfd[index].events = 0;
1087                     continue;
1088                 }
1089 
1090                 ret = read(pcfd->fd, &msg, sizeof(msg));
1091                 if (ret != sizeof(msg)) {
1092                     if (errno == EAGAIN) {
1093                         /*
1094                          * if a wake up happens on the other thread just after
1095                          * the poll, there is nothing to read.
1096                          */
1097                         continue;
1098                     }
1099                     if (ret < 0) {
1100                         error_report("%s: Failed to read full userfault "
1101                                      "message: %s (shared) revents=%d",
1102                                      __func__, strerror(errno),
1103                                      pfd[index].revents);
1104                         /*TODO: Could just disable this sharer */
1105                         break;
1106                     } else {
1107                         error_report("%s: Read %d bytes from userfaultfd "
1108                                      "expected %zd (shared)",
1109                                      __func__, ret, sizeof(msg));
1110                         /*TODO: Could just disable this sharer */
1111                         break; /*Lost alignment,don't know what we'd read next*/
1112                     }
1113                 }
1114                 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1115                     error_report("%s: Read unexpected event %u "
1116                                  "from userfaultfd (shared)",
1117                                  __func__, msg.event);
1118                     continue; /* It's not a page fault, shouldn't happen */
1119                 }
1120                 /* Call the device handler registered with us */
1121                 ret = pcfd->handler(pcfd, &msg);
1122                 if (ret) {
1123                     error_report("%s: Failed to resolve shared fault on %zd/%s",
1124                                  __func__, index, pcfd->idstr);
1125                     /* TODO: Fail? Disable this sharer? */
1126                 }
1127             }
1128         }
1129     }
1130     rcu_unregister_thread();
1131     trace_postcopy_ram_fault_thread_exit();
1132     g_free(pfd);
1133     return NULL;
1134 }
1135 
1136 static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
1137 {
1138     PostcopyTmpPage *tmp_page;
1139     int err, i, channels;
1140     void *temp_page;
1141 
1142     if (migrate_postcopy_preempt()) {
1143         /* If preemption enabled, need extra channel for urgent requests */
1144         mis->postcopy_channels = RAM_CHANNEL_MAX;
1145     } else {
1146         /* Both precopy/postcopy on the same channel */
1147         mis->postcopy_channels = 1;
1148     }
1149 
1150     channels = mis->postcopy_channels;
1151     mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels);
1152 
1153     for (i = 0; i < channels; i++) {
1154         tmp_page = &mis->postcopy_tmp_pages[i];
1155         temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE,
1156                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1157         if (temp_page == MAP_FAILED) {
1158             err = errno;
1159             error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s",
1160                          __func__, i, strerror(err));
1161             /* Clean up will be done later */
1162             return -err;
1163         }
1164         tmp_page->tmp_huge_page = temp_page;
1165         /* Initialize default states for each tmp page */
1166         postcopy_temp_page_reset(tmp_page);
1167     }
1168 
1169     /*
1170      * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
1171      */
1172     mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1173                                        PROT_READ | PROT_WRITE,
1174                                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1175     if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1176         err = errno;
1177         mis->postcopy_tmp_zero_page = NULL;
1178         error_report("%s: Failed to map large zero page %s",
1179                      __func__, strerror(err));
1180         return -err;
1181     }
1182 
1183     memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1184 
1185     return 0;
1186 }
1187 
1188 int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1189 {
1190     /* Open the fd for the kernel to give us userfaults */
1191     mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
1192     if (mis->userfault_fd == -1) {
1193         error_report("%s: Failed to open userfault fd: %s", __func__,
1194                      strerror(errno));
1195         return -1;
1196     }
1197 
1198     /*
1199      * Although the host check already tested the API, we need to
1200      * do the check again as an ABI handshake on the new fd.
1201      */
1202     if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
1203         return -1;
1204     }
1205 
1206     /* Now an eventfd we use to tell the fault-thread to quit */
1207     mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1208     if (mis->userfault_event_fd == -1) {
1209         error_report("%s: Opening userfault_event_fd: %s", __func__,
1210                      strerror(errno));
1211         close(mis->userfault_fd);
1212         return -1;
1213     }
1214 
1215     postcopy_thread_create(mis, &mis->fault_thread, "fault-default",
1216                            postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
1217     mis->have_fault_thread = true;
1218 
1219     /* Mark so that we get notified of accesses to unwritten areas */
1220     if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
1221         error_report("ram_block_enable_notify failed");
1222         return -1;
1223     }
1224 
1225     if (postcopy_temp_pages_setup(mis)) {
1226         /* Error dumped in the sub-function */
1227         return -1;
1228     }
1229 
1230     if (migrate_postcopy_preempt()) {
1231         /*
1232          * This thread needs to be created after the temp pages because
1233          * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
1234          */
1235         postcopy_thread_create(mis, &mis->postcopy_prio_thread, "fault-fast",
1236                                postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
1237         mis->preempt_thread_status = PREEMPT_THREAD_CREATED;
1238     }
1239 
1240     trace_postcopy_ram_enable_notify();
1241 
1242     return 0;
1243 }
1244 
1245 static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
1246                                void *from_addr, uint64_t pagesize, RAMBlock *rb)
1247 {
1248     int userfault_fd = mis->userfault_fd;
1249     int ret;
1250 
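    /* A NULL from_addr means "place a zero page" via UFFDIO_ZEROPAGE */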
1251     if (from_addr) {
1252         struct uffdio_copy copy_struct;
1253         copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1254         copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1255         copy_struct.len = pagesize;
1256         copy_struct.mode = 0;
1257         ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
1258     } else {
1259         struct uffdio_zeropage zero_struct;
1260         zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1261         zero_struct.range.len = pagesize;
1262         zero_struct.mode = 0;
1263         ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1264     }
1265     if (!ret) {
1266         qemu_mutex_lock(&mis->page_request_mutex);
1267         ramblock_recv_bitmap_set_range(rb, host_addr,
1268                                        pagesize / qemu_target_page_size());
1269         /*
1270          * If this page resolves a page fault for a previous recorded faulted
1271          * address, take a special note to maintain the requested page list.
1272          */
1273         if (g_tree_lookup(mis->page_requested, host_addr)) {
1274             g_tree_remove(mis->page_requested, host_addr);
1275             mis->page_requested_count--;
1276             trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
1277         }
1278         qemu_mutex_unlock(&mis->page_request_mutex);
1279         mark_postcopy_blocktime_end((uintptr_t)host_addr);
1280     }
1281     return ret;
1282 }
1283 
1284 int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1285 {
1286     int i;
1287     MigrationIncomingState *mis = migration_incoming_get_current();
1288     GArray *pcrfds = mis->postcopy_remote_fds;
1289 
1290     for (i = 0; i < pcrfds->len; i++) {
1291         struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1292         int ret = cur->waker(cur, rb, offset);
1293         if (ret) {
1294             return ret;
1295         }
1296     }
1297     return 0;
1298 }
1299 
1300 /*
1301  * Place a host page (from) at (host) atomically
1302  * returns 0 on success
1303  */
1304 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1305                         RAMBlock *rb)
1306 {
1307     size_t pagesize = qemu_ram_pagesize(rb);
1308 
1309     /* The copy also acks to the kernel, waking the stalled thread up.
1310      * TODO: We can inhibit that ack and only do it if it was requested
1311      * which would be slightly cheaper, but we'd have to be careful
1312      * of the order of updating our page state.
1313      */
1314     if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
1315         int e = errno;
1316         error_report("%s: %s copy host: %p from: %p (size: %zd)",
1317                      __func__, strerror(e), host, from, pagesize);
1318 
1319         return -e;
1320     }
1321 
1322     trace_postcopy_place_page(host);
1323     return postcopy_notify_shared_wake(rb,
1324                                        qemu_ram_block_host_offset(rb, host));
1325 }
1326 
1327 /*
1328  * Place a zero page at (host) atomically
1329  * returns 0 on success
1330  */
1331 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1332                              RAMBlock *rb)
1333 {
1334     size_t pagesize = qemu_ram_pagesize(rb);
1335     trace_postcopy_place_page_zero(host);
1336 
1337     /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
1338      * but it's not available for everything (e.g. hugetlbpages)
1339      */
1340     if (qemu_ram_is_uf_zeroable(rb)) {
1341         if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
1342             int e = errno;
1343             error_report("%s: %s zero host: %p",
1344                          __func__, strerror(e), host);
1345 
1346             return -e;
1347         }
1348         return postcopy_notify_shared_wake(rb,
1349                                            qemu_ram_block_host_offset(rb,
1350                                                                       host));
1351     } else {
1352         return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
1353     }
1354 }
1355 
1356 #else
1357 /* No target OS support, stubs just fail */
1358 void fill_destination_postcopy_migration_info(MigrationInfo *info)
1359 {
1360 }
1361 
1362 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
1363 {
1364     error_report("%s: No OS support", __func__);
1365     return false;
1366 }
1367 
1368 int postcopy_ram_incoming_init(MigrationIncomingState *mis)
1369 {
1370     error_report("postcopy_ram_incoming_init: No OS support");
1371     return -1;
1372 }
1373 
1374 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
1375 {
1376     assert(0);
1377     return -1;
1378 }
1379 
1380 int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
1381 {
1382     assert(0);
1383     return -1;
1384 }
1385 
1386 int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
1387                                  uint64_t client_addr, uint64_t rb_offset)
1388 {
1389     assert(0);
1390     return -1;
1391 }
1392 
1393 int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1394 {
1395     assert(0);
1396     return -1;
1397 }
1398 
1399 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1400                         RAMBlock *rb)
1401 {
1402     assert(0);
1403     return -1;
1404 }
1405 
1406 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1407                         RAMBlock *rb)
1408 {
1409     assert(0);
1410     return -1;
1411 }
1412 
1413 int postcopy_wake_shared(struct PostCopyFD *pcfd,
1414                          uint64_t client_addr,
1415                          RAMBlock *rb)
1416 {
1417     assert(0);
1418     return -1;
1419 }
1420 #endif
1421 
1422 /* ------------------------------------------------------------------------- */
1423 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page)
1424 {
1425     tmp_page->target_pages = 0;
1426     tmp_page->host_addr = NULL;
1427     /*
1428      * This is set to true on reset, and cleared as soon as we receive any
1429      * non-zero small page within this huge page.
1430      */
1431     tmp_page->all_zero = true;
1432 }
1433 
1434 void postcopy_fault_thread_notify(MigrationIncomingState *mis)
1435 {
1436     uint64_t tmp64 = 1;
1437 
1438     /*
1439      * Wakeup the fault_thread.  It's an eventfd that should currently
1440      * be at 0, we're going to increment it to 1
1441      */
1442     if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
1443         /* Not much we can do here, but may as well report it */
1444         error_report("%s: incrementing failed: %s", __func__,
1445                      strerror(errno));
1446     }
1447 }
1448 
1449 /**
1450  * postcopy_discard_send_init: Called at the start of each RAMBlock before
1451  *   asking to discard individual ranges.
1452  *
1453  * @ms: The current migration state.
1455  * @name: RAMBlock that discards will operate on.
1456  */
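/*
 * A single global batch is enough: postcopy_discard_send_init() and
 * postcopy_discard_send_finish() bracket the discards of one RAMBlock at a
 * time on the source side.
 */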
1457 static PostcopyDiscardState pds = {0};
1458 void postcopy_discard_send_init(MigrationState *ms, const char *name)
1459 {
1460     pds.ramblock_name = name;
1461     pds.cur_entry = 0;
1462     pds.nsentwords = 0;
1463     pds.nsentcmds = 0;
1464 }
1465 
1466 /**
1467  * postcopy_discard_send_range: Called by the bitmap code for each chunk to
1468  *   discard. May send a discard message, may just leave it queued to
1469  *   be sent later.
1470  *
1471  * @ms: Current migration state.
1472  * @start,@length: a range of pages in the migration bitmap in the
1473  *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
1474  */
1475 void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
1476                                  unsigned long length)
1477 {
1478     size_t tp_size = qemu_target_page_size();
1479     /* Convert to byte offsets within the RAM block */
1480     pds.start_list[pds.cur_entry] = start  * tp_size;
1481     pds.length_list[pds.cur_entry] = length * tp_size;
1482     trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
1483     pds.cur_entry++;
1484     pds.nsentwords++;
1485 
1486     if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
1487         /* Full set, ship it! */
1488         qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1489                                               pds.ramblock_name,
1490                                               pds.cur_entry,
1491                                               pds.start_list,
1492                                               pds.length_list);
1493         pds.nsentcmds++;
1494         pds.cur_entry = 0;
1495     }
1496 }
1497 
1498 /**
1499  * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
1500  * bitmap code. Sends any outstanding discard messages, frees the PDS
1501  *
1502  * @ms: Current migration state.
1503  */
1504 void postcopy_discard_send_finish(MigrationState *ms)
1505 {
1506     /* Anything unsent? */
1507     if (pds.cur_entry) {
1508         qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1509                                               pds.ramblock_name,
1510                                               pds.cur_entry,
1511                                               pds.start_list,
1512                                               pds.length_list);
1513         pds.nsentcmds++;
1514     }
1515 
1516     trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
1517                                        pds.nsentcmds);
1518 }
1519 
1520 /*
1521  * Current state of incoming postcopy; note this is not part of
1522  * MigrationIncomingState since its state is used during cleanup
1523  * at the end as MIS is being freed.
1524  */
1525 static PostcopyState incoming_postcopy_state;
1526 
1527 PostcopyState  postcopy_state_get(void)
1528 {
1529     return qatomic_load_acquire(&incoming_postcopy_state);
1530 }
1531 
1532 /* Set the state and return the old state */
1533 PostcopyState postcopy_state_set(PostcopyState new_state)
1534 {
1535     return qatomic_xchg(&incoming_postcopy_state, new_state);
1536 }
1537 
1538 /* Register a handler for external shared memory postcopy
1539  * called on the destination.
1540  */
1541 void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
1542 {
1543     MigrationIncomingState *mis = migration_incoming_get_current();
1544 
1545     mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
1546                                                   *pcfd);
1547 }
1548 
1549 /* Unregister a handler for external shared memory postcopy
1550  */
1551 void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1552 {
1553     guint i;
1554     MigrationIncomingState *mis = migration_incoming_get_current();
1555     GArray *pcrfds = mis->postcopy_remote_fds;
1556 
1557     if (!pcrfds) {
1558         /* migration has already finished and freed the array */
1559         return;
1560     }
1561     for (i = 0; i < pcrfds->len; i++) {
1562         struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1563         if (cur->fd == pcfd->fd) {
1564             mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1565             return;
1566         }
1567     }
1568 }
1569 
1570 void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
1571 {
1572     /*
1573      * The new loading channel has its own threads, so it needs to be
1574      * blocked too.  It's by default true, just be explicit.
1575      */
1576     qemu_file_set_blocking(file, true);
1577     mis->postcopy_qemufile_dst = file;
1578     qemu_sem_post(&mis->postcopy_qemufile_dst_done);
1579     trace_postcopy_preempt_new_channel();
1580 }
1581 
1582 /*
1583  * Set up the postcopy preempt channel with the IOC.  If ERROR is specified,
1584  * set up the error instead.  This helper will free the ERROR if specified.
1585  */
1586 static void
1587 postcopy_preempt_send_channel_done(MigrationState *s,
1588                                    QIOChannel *ioc, Error *local_err)
1589 {
1590     if (local_err) {
1591         migrate_set_error(s, local_err);
1592         error_free(local_err);
1593     } else {
1594         migration_ioc_register_yank(ioc);
1595         s->postcopy_qemufile_src = qemu_file_new_output(ioc);
1596         trace_postcopy_preempt_new_channel();
1597     }
1598 
1599     /*
1600      * Kick the waiter in all cases.  The waiter should check upon
1601      * postcopy_qemufile_src to know whether it failed or not.
1602      */
1603     qemu_sem_post(&s->postcopy_qemufile_src_sem);
1604 }
1605 
1606 static void
1607 postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque)
1608 {
1609     g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1610     MigrationState *s = opaque;
1611     Error *local_err = NULL;
1612 
1613     qio_task_propagate_error(task, &local_err);
1614     postcopy_preempt_send_channel_done(s, ioc, local_err);
1615 }
1616 
1617 static void
1618 postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
1619 {
1620     g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1621     MigrationState *s = opaque;
1622     QIOChannelTLS *tioc;
1623     Error *local_err = NULL;
1624 
1625     if (qio_task_propagate_error(task, &local_err)) {
1626         goto out;
1627     }
1628 
1629     if (migrate_channel_requires_tls_upgrade(ioc)) {
1630         tioc = migration_tls_client_create(s, ioc, s->hostname, &local_err);
1631         if (!tioc) {
1632             goto out;
1633         }
1634         trace_postcopy_preempt_tls_handshake();
1635         qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt");
1636         qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake,
1637                                   s, NULL, NULL);
1638         /* Channel setup will continue once the TLS handshake has finished */
1639         return;
1640     }
1641 
1642 out:
1643     /* This handles both good and error cases */
1644     postcopy_preempt_send_channel_done(s, ioc, local_err);
1645 }
1646 
1647 /*
1648  * This function will kick off an async task to establish the preempt
1649  * channel, and wait until the connection setup completed.  Returns 0 if
1650  * channel established, -1 for error.
1651  */
1652 int postcopy_preempt_establish_channel(MigrationState *s)
1653 {
1654     /* If preempt not enabled, no need to wait */
1655     if (!migrate_postcopy_preempt()) {
1656         return 0;
1657     }
1658 
1659     /*
1660      * Kick off async task to establish preempt channel.  Only do so with
1661      * 8.0+ machines, because 7.1/7.2 require the channel to be created in
1662      * setup phase of migration (even if racy in an unreliable network).
1663      */
1664     if (!s->preempt_pre_7_2) {
1665         postcopy_preempt_setup(s);
1666     }
1667 
1668     /*
1669      * We need the postcopy preempt channel to be established before
1670      * starting doing anything.
1671      */
1672     qemu_sem_wait(&s->postcopy_qemufile_src_sem);
1673 
1674     return s->postcopy_qemufile_src ? 0 : -1;
1675 }
1676 
1677 void postcopy_preempt_setup(MigrationState *s)
1678 {
1679     /* Kick an async task to connect */
1680     socket_send_channel_create(postcopy_preempt_send_channel_new, s);
1681 }
1682 
1683 static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
1684 {
1685     trace_postcopy_pause_fast_load();
1686     qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
1687     qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
1688     qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
1689     trace_postcopy_pause_fast_load_continued();
1690 }
1691 
1692 static bool preempt_thread_should_run(MigrationIncomingState *mis)
1693 {
1694     return mis->preempt_thread_status != PREEMPT_THREAD_QUIT;
1695 }
1696 
1697 void *postcopy_preempt_thread(void *opaque)
1698 {
1699     MigrationIncomingState *mis = opaque;
1700     int ret;
1701 
1702     trace_postcopy_preempt_thread_entry();
1703 
1704     rcu_register_thread();
1705 
1706     qemu_sem_post(&mis->thread_sync_sem);
1707 
1708     /*
1709      * The preempt channel is established in asynchronous way.  Wait
1710      * for its completion.
1711      */
1712     qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
1713 
1714     /* The source sends RAM_SAVE_FLAG_EOS to terminate this thread */
1715     qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
1716     while (preempt_thread_should_run(mis)) {
1717         ret = ram_load_postcopy(mis->postcopy_qemufile_dst,
1718                                 RAM_CHANNEL_POSTCOPY);
1719         /* If error happened, go into recovery routine */
1720         if (ret && preempt_thread_should_run(mis)) {
1721             postcopy_pause_ram_fast_load(mis);
1722         } else {
1723             /* We're done */
1724             break;
1725         }
1726     }
1727     qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
1728 
1729     rcu_unregister_thread();
1730 
1731     trace_postcopy_preempt_thread_exit();
1732 
1733     return NULL;
1734 }
1735