1 /*
2 * Postcopy migration for RAM
3 *
4 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
5 *
6 * Authors:
7 * Dave Gilbert <dgilbert@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
10 * See the COPYING file in the top-level directory.
11 *
12 */
13
14 /*
15 * Postcopy is a migration technique where the execution flips from the
16 * source to the destination before all the data has been copied.
17 */
18
19 #include "qemu/osdep.h"
20 #include "qemu/madvise.h"
21 #include "exec/target_page.h"
22 #include "migration.h"
23 #include "qemu-file.h"
24 #include "savevm.h"
25 #include "postcopy-ram.h"
26 #include "ram.h"
27 #include "qapi/error.h"
28 #include "qemu/notify.h"
29 #include "qemu/rcu.h"
30 #include "system/system.h"
31 #include "qemu/error-report.h"
32 #include "trace.h"
33 #include "hw/boards.h"
34 #include "system/ramblock.h"
35 #include "socket.h"
36 #include "yank_functions.h"
37 #include "tls.h"
38 #include "qemu/userfaultfd.h"
39 #include "qemu/mmap-alloc.h"
40 #include "options.h"
41
42 /* Arbitrary limit on size of each discard command,
43 * keeps them around ~200 bytes
44 */
45 #define MAX_DISCARDS_PER_COMMAND 12
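/* (Roughly: 12 entries of an 8-byte start plus an 8-byte length is 192 bytes,
 * plus the command header.) */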
46
47 typedef struct PostcopyDiscardState {
48 const char *ramblock_name;
49 uint16_t cur_entry;
50 /*
51 * Start and length of a discard range (bytes)
52 */
53 uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
54 uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
55 unsigned int nsentwords;
56 unsigned int nsentcmds;
57 } PostcopyDiscardState;
58
59 static NotifierWithReturnList postcopy_notifier_list;
60
61 void postcopy_infrastructure_init(void)
62 {
63 notifier_with_return_list_init(&postcopy_notifier_list);
64 }
65
66 void postcopy_add_notifier(NotifierWithReturn *nn)
67 {
68 notifier_with_return_list_add(&postcopy_notifier_list, nn);
69 }
70
71 void postcopy_remove_notifier(NotifierWithReturn *n)
72 {
73 notifier_with_return_remove(n);
74 }
75
76 int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
77 {
78 struct PostcopyNotifyData pnd;
79 pnd.reason = reason;
80
81 return notifier_with_return_list_notify(&postcopy_notifier_list,
82 &pnd, errp);
83 }
84
85 /*
86 * NOTE: this routine is not thread-safe; we can't call it concurrently. But it
87 * should be good enough for migration's purposes.
88 */
89 void postcopy_thread_create(MigrationIncomingState *mis,
90 QemuThread *thread, const char *name,
91 void *(*fn)(void *), int joinable)
92 {
93 qemu_event_init(&mis->thread_sync_event, false);
94 qemu_thread_create(thread, name, fn, mis, joinable);
95 qemu_event_wait(&mis->thread_sync_event);
96 qemu_event_destroy(&mis->thread_sync_event);
97 }
98
99 /* Postcopy needs to detect accesses to pages that haven't yet been copied
100 * across, and efficiently map new pages in; the techniques for doing this
101 * are target OS specific.
102 */
103 #if defined(__linux__)
104 #include <poll.h>
105 #include <sys/ioctl.h>
106 #include <sys/syscall.h>
107 #endif
108
109 #if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
110 #include <sys/eventfd.h>
111 #include <linux/userfaultfd.h>
112
113 /*
114 * Here we use 24 buckets, which means the last bucket will cover [2^24 us,
115 * 2^25 us) ~= [16, 32) seconds. It should be far enough to record even
116 * extreme (perf-wise broken) 1G pages moving over, which can sometimes
117 * take a few seconds due to various reasons. Anything longer than that
118 * is probably not worth accounting for.
119 */
120 #define BLOCKTIME_LATENCY_BUCKET_N (24)
121
122 /* All the time records are in unit of nanoseconds */
123 typedef struct PostcopyBlocktimeContext {
124 /* blocktime per vCPU */
125 uint64_t *vcpu_blocktime_total;
126 /* count of faults per vCPU */
127 uint64_t *vcpu_faults_count;
128 /*
129 * count of currently blocked faults per vCPU.
130 *
131 * NOTE: Normally there should only be one fault in-progress per vCPU
132 * thread, so logically it _seems_ vcpu_faults_count[] for any vCPU
133 * should be either zero or one. However, there are reasons why we can
134 * see more than one fault on the same vCPU thread.
135 *
136 * CASE (1): since the process to resolve faults (ioctl(UFFDIO_COPY),
137 * for example) is done before taking the mutex that protects the
138 * blocktime context, it can happen that we read more than one faulted
139 * address per vCPU.
140 *
141 * One example where we can see more than one faulted address for one vCPU:
142 *
143 * vcpu1 thread fault thread resolve thread
144 * ============ ============ ==============
145 *
146 * faulted on addr1
147 * read uffd msg (addr1)
148 * MUTEX_LOCK
149 * add entry (cpu1, addr1)
150 * MUTEX_UNLOCK
151 * request remote fault (addr1)
152 * resolve fault (addr1)
153 * addr1 resolved, continue..
154 * faulted on addr2
155 * read uffd msg (addr2)
156 * MUTEX_LOCK
157 * add entry (cpu1, addr2) <--------------- [A]
158 * MUTEX_UNLOCK
159 * MUTEX_LOCK
160 * remove entry (cpu1, addr1)
161 * MUTEX_UNLOCK
162 *
163 * In the above case, we may see the (cpu1, addr1) and (cpu1, addr2)
164 * entries appear together at [A], when the fault thread gets the lock
165 * before the resolve thread. Use this counter to track such cases; only
166 * when it reaches zero do we know the vCPU is not blocked anymore.
167 *
168 * CASE (2): theoretically (the author admits to not having verified
169 * this..), one vCPU thread can also generate more than one userfaultfd
170 * message on the same address, e.g. if for whatever reason the fault
171 * got retried before a resolution arrived. In that extremely rare
172 * case, we could also see two (cpu1, addr1) entries.
173 *
174 * In all cases, be prepared for such re-entrancies with this array.
175 *
176 * Using uint8_t should be more than enough for now. For example, when
177 * there is only one resolve thread (the postcopy ram listening thread),
178 * the maximum number of concurrent fault entries should be two.
179 */
180 uint8_t *vcpu_faults_current;
181 /*
182 * The hash that contains addr1->[(cpu1,ts1),(cpu2,ts2) ...] mappings.
183 * Each entry is a tuple of (CPU index, fault timestamp) showing
184 * that a fault was requested.
185 */
186 GHashTable *vcpu_addr_hash;
187 /*
188 * Each bucket stores the count of faults that were resolved within the
189 * bucket window [2^N us, 2^(N+1) us).
190 */
191 uint64_t latency_buckets[BLOCKTIME_LATENCY_BUCKET_N];
192 /* total blocktime when all vCPUs are stopped */
193 uint64_t total_blocktime;
194 /* point in time when last page fault was initiated */
195 uint64_t last_begin;
196 /* number of vCPUs currently suspended */
197 int smp_cpus_down;
198
199 /*
200 * Fast path for looking up vcpu_index from tid. NOTE: this result
201 * only reflects the vcpu setup when postcopy is running. It may not
202 * always match with the current vcpu setup because vcpus can be hot
203 * attached/detached after migration completes. However this should be
204 * stable when blocktime is using the structure.
205 */
206 GHashTable *tid_to_vcpu_hash;
207 /* Count of non-vCPU faults. This is only for debugging purpose. */
208 uint64_t non_vcpu_faults;
209 /* total blocktime when a non-vCPU thread is stopped */
210 uint64_t non_vcpu_blocktime_total;
211
212 /*
213 * Handler for exit event, necessary for
214 * releasing whole blocktime_ctx
215 */
216 Notifier exit_notifier;
217 } PostcopyBlocktimeContext;
218
219 typedef struct {
220 /* The time the fault was triggered */
221 uint64_t fault_time;
222 /*
223 * The vCPU index that was blocked; when cpu==-1, it means the
224 * fault came from a non-vCPU thread.
225 */
226 int cpu;
227 } BlocktimeVCPUEntry;
228
229 /* Alloc an entry to record a vCPU fault */
230 static BlocktimeVCPUEntry *
231 blocktime_vcpu_entry_alloc(int cpu, uint64_t fault_time)
232 {
233 BlocktimeVCPUEntry *entry = g_new(BlocktimeVCPUEntry, 1);
234
235 entry->fault_time = fault_time;
236 entry->cpu = cpu;
237
238 return entry;
239 }
240
241 /* Free a @GList of @BlocktimeVCPUEntry */
242 static void blocktime_vcpu_list_free(gpointer data)
243 {
244 g_list_free_full(data, g_free);
245 }
246
247 static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
248 {
249 g_hash_table_destroy(ctx->tid_to_vcpu_hash);
250 g_hash_table_destroy(ctx->vcpu_addr_hash);
251 g_free(ctx->vcpu_blocktime_total);
252 g_free(ctx->vcpu_faults_count);
253 g_free(ctx->vcpu_faults_current);
254 g_free(ctx);
255 }
256
257 static void migration_exit_cb(Notifier *n, void *data)
258 {
259 PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
260 exit_notifier);
261 destroy_blocktime_context(ctx);
262 }
263
264 static GHashTable *blocktime_init_tid_to_vcpu_hash(void)
265 {
266 /*
267 * TID as an unsigned int can be directly used as the key. However,
268 * CPU index can NOT be directly used as value, because CPU index can
269 * be 0, which would look like NULL. On lookup we could then never know
270 * whether it's CPU 0 or "not found". Hence use an indirection for CPU index.
271 */
272 GHashTable *table = g_hash_table_new_full(g_direct_hash, g_direct_equal,
273 NULL, g_free);
274 CPUState *cpu;
275
276 /*
277 * Initialize the tid->cpu_id mapping for lookups. The caller needs to
278 * make sure when reaching here the CPU topology is frozen and will be
279 * stable for the whole blocktime trapping period.
280 */
281 CPU_FOREACH(cpu) {
282 int *value = g_new(int, 1);
283
284 *value = cpu->cpu_index;
285 g_hash_table_insert(table,
286 GUINT_TO_POINTER((uint32_t)cpu->thread_id),
287 value);
288 trace_postcopy_blocktime_tid_cpu_map(cpu->cpu_index, cpu->thread_id);
289 }
290
291 return table;
292 }
293
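/* Allocate and initialize the blocktime context for this incoming migration. */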
294 static struct PostcopyBlocktimeContext *blocktime_context_new(void)
295 {
296 MachineState *ms = MACHINE(qdev_get_machine());
297 unsigned int smp_cpus = ms->smp.cpus;
298 PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
299
300 /* Initialize all counters to be zeros */
301 memset(ctx->latency_buckets, 0, sizeof(ctx->latency_buckets));
302
303 ctx->vcpu_blocktime_total = g_new0(uint64_t, smp_cpus);
304 ctx->vcpu_faults_count = g_new0(uint64_t, smp_cpus);
305 ctx->vcpu_faults_current = g_new0(uint8_t, smp_cpus);
306 ctx->tid_to_vcpu_hash = blocktime_init_tid_to_vcpu_hash();
307
308 /*
309 * The key (host virtual addresses) will always be gpointer-sized on
310 * either 32bits or 64bits systems, so it'll fit as a direct key.
311 *
312 * The value will be a list of BlocktimeVCPUEntry entries.
313 */
314 ctx->vcpu_addr_hash = g_hash_table_new_full(g_direct_hash,
315 g_direct_equal,
316 NULL,
317 blocktime_vcpu_list_free);
318
319 ctx->exit_notifier.notify = migration_exit_cb;
320 qemu_add_exit_notifier(&ctx->exit_notifier);
321
322 return ctx;
323 }
324
325 /*
326 * This function populates MigrationInfo from postcopy's
327 * blocktime context. It does nothing unless the
328 * postcopy-blocktime capability was set.
329 *
330 * @info: pointer to MigrationInfo to populate
331 */
332 void fill_destination_postcopy_migration_info(MigrationInfo *info)
333 {
334 MigrationIncomingState *mis = migration_incoming_get_current();
335 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
336 MachineState *ms = MACHINE(qdev_get_machine());
337 uint64_t latency_total = 0, faults = 0;
338 uint32List *list_blocktime = NULL;
339 uint64List *list_latency = NULL;
340 uint64List *latency_buckets = NULL;
341 int i;
342
343 if (!bc) {
344 return;
345 }
346
347 for (i = ms->smp.cpus - 1; i >= 0; i--) {
348 uint64_t latency, total, count;
349
350 /* Convert ns -> ms */
351 QAPI_LIST_PREPEND(list_blocktime,
352 (uint32_t)(bc->vcpu_blocktime_total[i] / SCALE_MS));
353
354 /* The rest in nanoseconds */
355 total = bc->vcpu_blocktime_total[i];
356 latency_total += total;
357 count = bc->vcpu_faults_count[i];
358 faults += count;
359
360 if (count) {
361 latency = total / count;
362 } else {
363 /* No fault detected */
364 latency = 0;
365 }
366
367 QAPI_LIST_PREPEND(list_latency, latency);
368 }
369
370 for (i = BLOCKTIME_LATENCY_BUCKET_N - 1; i >= 0; i--) {
371 QAPI_LIST_PREPEND(latency_buckets, bc->latency_buckets[i]);
372 }
373
374 latency_total += bc->non_vcpu_blocktime_total;
375 faults += bc->non_vcpu_faults;
376
377 info->has_postcopy_non_vcpu_latency = true;
378 info->postcopy_non_vcpu_latency = bc->non_vcpu_faults ?
379 (bc->non_vcpu_blocktime_total / bc->non_vcpu_faults) : 0;
380 info->has_postcopy_blocktime = true;
381 /* Convert ns -> ms */
382 info->postcopy_blocktime = (uint32_t)(bc->total_blocktime / SCALE_MS);
383 info->has_postcopy_vcpu_blocktime = true;
384 info->postcopy_vcpu_blocktime = list_blocktime;
385 info->has_postcopy_latency = true;
386 info->postcopy_latency = faults ? (latency_total / faults) : 0;
387 info->has_postcopy_vcpu_latency = true;
388 info->postcopy_vcpu_latency = list_latency;
389 info->has_postcopy_latency_dist = true;
390 info->postcopy_latency_dist = latency_buckets;
391 }
392
393 static uint64_t get_postcopy_total_blocktime(void)
394 {
395 MigrationIncomingState *mis = migration_incoming_get_current();
396 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
397
398 if (!bc) {
399 return 0;
400 }
401
402 return bc->total_blocktime;
403 }
404
405 /**
406 * receive_ufd_features: check userfault fd features, to request only supported
407 * features in the future.
408 *
409 * Returns: true on success
410 *
411 * __NR_userfaultfd should have been checked before calling this
412 * @features: out parameter that will contain uffdio_api.features provided by
413 * the kernel in case of success
414 */
415 static bool receive_ufd_features(uint64_t *features)
416 {
417 struct uffdio_api api_struct = {0};
418 int ufd;
419 bool ret = true;
420
421 ufd = uffd_open(O_CLOEXEC);
422 if (ufd == -1) {
423 error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
424 return false;
425 }
426
427 /* ask features */
428 api_struct.api = UFFD_API;
429 api_struct.features = 0;
430 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
431 error_report("%s: UFFDIO_API failed: %s", __func__,
432 strerror(errno));
433 ret = false;
434 goto release_ufd;
435 }
436
437 *features = api_struct.features;
438
439 release_ufd:
440 close(ufd);
441 return ret;
442 }
443
444 /**
445 * request_ufd_features: this function should be called only once on a newly
446 * opened ufd; subsequent calls will lead to an error.
447 *
448 * Returns: true on success
449 *
450 * @ufd: fd obtained from userfaultfd syscall
451 * @features: bit mask see UFFD_API_FEATURES
452 */
453 static bool request_ufd_features(int ufd, uint64_t features)
454 {
455 struct uffdio_api api_struct = {0};
456 uint64_t ioctl_mask;
457
458 api_struct.api = UFFD_API;
459 api_struct.features = features;
460 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
461 error_report("%s failed: UFFDIO_API failed: %s", __func__,
462 strerror(errno));
463 return false;
464 }
465
466 ioctl_mask = 1ULL << _UFFDIO_REGISTER |
467 1ULL << _UFFDIO_UNREGISTER;
468 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
469 error_report("Missing userfault features: %" PRIx64,
470 (uint64_t)(~api_struct.ioctls & ioctl_mask));
471 return false;
472 }
473
474 return true;
475 }
476
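/*
 * Check the userfaultfd API on @ufd and enable the optional features we can
 * use (e.g. UFFD_FEATURE_THREAD_ID); returns false if the host is missing
 * something that postcopy requires.
 */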
477 static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis,
478 Error **errp)
479 {
480 ERRP_GUARD();
481 uint64_t asked_features = 0;
482 static uint64_t supported_features;
483
484 /*
485 * It's not possible to request UFFD_API twice on the same fd,
486 * and the userfault fd features are persistent, so the
487 * supported feature set only needs to be detected once.
488 */
489 if (!supported_features) {
490 if (!receive_ufd_features(&supported_features)) {
491 error_setg(errp, "Userfault feature detection failed");
492 return false;
493 }
494 }
495
496 #ifdef UFFD_FEATURE_THREAD_ID
497 /*
498 * The postcopy blocktime feature needs the THREAD_ID feature (introduced
499 * in Linux in 2017). Always try to enable it when QEMU is compiled
500 * in such an environment.
501 */
502 if (UFFD_FEATURE_THREAD_ID & supported_features) {
503 asked_features |= UFFD_FEATURE_THREAD_ID;
504 }
505 #endif
506
507 /*
508 * Request features even if asked_features is 0, because the
509 * kernel expects UFFD_API before UFFDIO_REGISTER on each
510 * userfault file descriptor
511 */
512 if (!request_ufd_features(ufd, asked_features)) {
513 error_setg(errp, "Failed features %" PRIu64, asked_features);
514 return false;
515 }
516
517 if (qemu_real_host_page_size() != ram_pagesize_summary()) {
518 bool have_hp = false;
519 /* We've got a huge page */
520 #ifdef UFFD_FEATURE_MISSING_HUGETLBFS
521 have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
522 #endif
523 if (!have_hp) {
524 error_setg(errp,
525 "Userfault on this host does not support huge pages");
526 return false;
527 }
528 }
529 return true;
530 }
531
532 /* Callback from postcopy_ram_supported_by_host block iterator.
533 */
534 static int test_ramblock_postcopiable(RAMBlock *rb, Error **errp)
535 {
536 const char *block_name = qemu_ram_get_idstr(rb);
537 ram_addr_t length = qemu_ram_get_used_length(rb);
538 size_t pagesize = qemu_ram_pagesize(rb);
539 QemuFsType fs;
540
541 if (length % pagesize) {
542 error_setg(errp,
543 "Postcopy requires RAM blocks to be a page size multiple,"
544 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
545 "page size of 0x%zx", block_name, length, pagesize);
546 return 1;
547 }
548
549 if (rb->fd >= 0) {
550 fs = qemu_fd_getfs(rb->fd);
551 if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) {
552 error_setg(errp,
553 "Host backend files need to be TMPFS or HUGETLBFS only");
554 return 1;
555 }
556 }
557
558 return 0;
559 }
560
561 /*
562 * Note: This has the side effect of munlock'ing all of RAM; that's
563 * normally fine since, if the postcopy succeeds, it gets turned back on at the
564 * end.
565 */
566 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
567 {
568 ERRP_GUARD();
569 long pagesize = qemu_real_host_page_size();
570 int ufd = -1;
571 bool ret = false; /* Error unless we change it */
572 void *testarea = NULL;
573 struct uffdio_register reg_struct;
574 struct uffdio_range range_struct;
575 uint64_t feature_mask;
576 RAMBlock *block;
577
578 if (qemu_target_page_size() > pagesize) {
579 error_setg(errp, "Target page size bigger than host page size");
580 goto out;
581 }
582
583 ufd = uffd_open(O_CLOEXEC);
584 if (ufd == -1) {
585 error_setg(errp, "Userfaultfd not available: %s", strerror(errno));
586 goto out;
587 }
588
589 /* Give devices a chance to object */
590 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, errp)) {
591 goto out;
592 }
593
594 /* Version and features check */
595 if (!ufd_check_and_apply(ufd, mis, errp)) {
596 goto out;
597 }
598
599 /*
600 * We don't support postcopy with some types of ramblocks.
601 *
602 * NOTE: we explicitly ignore migrate_ram_is_ignored() and instead check
603 * all possible ramblocks. This is because this function can be called
604 * when creating the migration object, a phase during which RAM_MIGRATABLE
605 * is not even properly set for all the ramblocks.
606 *
607 * A side effect of this is we'll also check against RAM_SHARED
608 * ramblocks even if migrate_ignore_shared() is set (in which case
609 * we'll never migrate RAM_SHARED at all), but normally this shouldn't
610 * matter in practice, or we can revisit.
611 */
612 RAMBLOCK_FOREACH(block) {
613 if (test_ramblock_postcopiable(block, errp)) {
614 goto out;
615 }
616 }
617
618 /*
619 * userfault and mlock don't go together; we'll put it back later if
620 * it was enabled.
621 */
622 if (munlockall()) {
623 error_setg(errp, "munlockall() failed: %s", strerror(errno));
624 goto out;
625 }
626
627 /*
628 * We need to check that the ops we need are supported on anon memory.
629 * To do that we register a chunk and see the flags that
630 * are returned.
631 */
632 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
633 MAP_ANONYMOUS, -1, 0);
634 if (testarea == MAP_FAILED) {
635 error_setg(errp, "Failed to map test area: %s", strerror(errno));
636 goto out;
637 }
638 g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
639
640 reg_struct.range.start = (uintptr_t)testarea;
641 reg_struct.range.len = pagesize;
642 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
643
644 if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
645 error_setg(errp, "UFFDIO_REGISTER failed: %s", strerror(errno));
646 goto out;
647 }
648
649 range_struct.start = (uintptr_t)testarea;
650 range_struct.len = pagesize;
651 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
652 error_setg(errp, "UFFDIO_UNREGISTER failed: %s", strerror(errno));
653 goto out;
654 }
655
656 feature_mask = 1ULL << _UFFDIO_WAKE |
657 1ULL << _UFFDIO_COPY |
658 1ULL << _UFFDIO_ZEROPAGE;
659 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
660 error_setg(errp, "Missing userfault map features: %" PRIx64,
661 (uint64_t)(~reg_struct.ioctls & feature_mask));
662 goto out;
663 }
664
665 /* Success! */
666 ret = true;
667 out:
668 if (testarea) {
669 munmap(testarea, pagesize);
670 }
671 if (ufd != -1) {
672 close(ufd);
673 }
674 return ret;
675 }
676
677 /*
678 * Setup an area of RAM so that it *can* be used for postcopy later; this
679 * must be done right at the start prior to pre-copy.
680 * opaque should be the MIS.
681 */
682 static int init_range(RAMBlock *rb, void *opaque)
683 {
684 const char *block_name = qemu_ram_get_idstr(rb);
685 void *host_addr = qemu_ram_get_host_addr(rb);
686 ram_addr_t offset = qemu_ram_get_offset(rb);
687 ram_addr_t length = qemu_ram_get_used_length(rb);
688 trace_postcopy_init_range(block_name, host_addr, offset, length);
689
690 /*
691 * Save the used_length before running the guest. In case we have to
692 * resize RAM blocks when syncing RAM block sizes from the source during
693 * precopy, we'll update it manually via the ram block notifier.
694 */
695 rb->postcopy_length = length;
696
697 /*
698 * We need the whole of RAM to be truly empty for postcopy, so things
699 * like ROMs and any data tables built during init must be zero'd
700 * - we're going to get the copy from the source anyway.
701 * (Precopy will just overwrite this data, so doesn't need the discard)
702 */
703 if (ram_discard_range(block_name, 0, length)) {
704 return -1;
705 }
706
707 return 0;
708 }
709
710 /*
711 * At the end of migration, undo the effects of init_range
712 * opaque should be the MIS.
713 */
714 static int cleanup_range(RAMBlock *rb, void *opaque)
715 {
716 const char *block_name = qemu_ram_get_idstr(rb);
717 void *host_addr = qemu_ram_get_host_addr(rb);
718 ram_addr_t offset = qemu_ram_get_offset(rb);
719 ram_addr_t length = rb->postcopy_length;
720 MigrationIncomingState *mis = opaque;
721 struct uffdio_range range_struct;
722 trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
723
724 /*
725 * We turned off hugepages for the precopy stage with postcopy enabled;
726 * we can turn them back on now.
727 */
728 qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
729
730 /*
731 * We can also turn off userfault now since we should have all the
732 * pages. It can be useful to leave it on to debug postcopy
733 * if you're not sure it's always getting every page.
734 */
735 range_struct.start = (uintptr_t)host_addr;
736 range_struct.len = length;
737
738 if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
739 error_report("%s: userfault unregister %s", __func__, strerror(errno));
740
741 return -1;
742 }
743
744 return 0;
745 }
746
747 /*
748 * Initialise postcopy-ram, setting the RAM to a state where we can go into
749 * postcopy later; must be called prior to any precopy.
750 * called from arch_init's similarly named ram_postcopy_incoming_init
751 */
752 int postcopy_ram_incoming_init(MigrationIncomingState *mis)
753 {
754 if (foreach_not_ignored_block(init_range, NULL)) {
755 return -1;
756 }
757
758 return 0;
759 }
760
761 static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis)
762 {
763 int i;
764
765 if (mis->postcopy_tmp_pages) {
766 for (i = 0; i < mis->postcopy_channels; i++) {
767 if (mis->postcopy_tmp_pages[i].tmp_huge_page) {
768 munmap(mis->postcopy_tmp_pages[i].tmp_huge_page,
769 mis->largest_page_size);
770 mis->postcopy_tmp_pages[i].tmp_huge_page = NULL;
771 }
772 }
773 g_free(mis->postcopy_tmp_pages);
774 mis->postcopy_tmp_pages = NULL;
775 }
776
777 if (mis->postcopy_tmp_zero_page) {
778 munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
779 mis->postcopy_tmp_zero_page = NULL;
780 }
781 }
782
783 /*
784 * At the end of a migration where postcopy_ram_incoming_init was called.
785 */
786 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
787 {
788 trace_postcopy_ram_incoming_cleanup_entry();
789
790 if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) {
791 /* Notify the fast load thread to quit */
792 mis->preempt_thread_status = PREEMPT_THREAD_QUIT;
793 /*
794 * Update preempt_thread_status before reading count. Note: the mutex
795 * lock only provides ACQUIRE semantics, so it doesn't stop this
796 * write from being reordered after reading the count.
797 */
798 smp_mb();
799 /*
800 * It's possible that the preempt thread is still handling the last
801 * pages to arrive which were requested by guest page faults.
802 * Make sure nothing is left behind by waiting on the condvar if
803 * that unlikely case happens.
804 */
805 WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
806 if (qatomic_read(&mis->page_requested_count)) {
807 /*
808 * We are guaranteed to receive a signal later, because the
809 * count is >0 now, so it's destined to be decreased to zero
810 * very soon by the preempt thread.
811 */
812 qemu_cond_wait(&mis->page_request_cond,
813 &mis->page_request_mutex);
814 }
815 }
816 /* Notify the fast load thread to quit */
817 if (mis->postcopy_qemufile_dst) {
818 qemu_file_shutdown(mis->postcopy_qemufile_dst);
819 }
820 qemu_thread_join(&mis->postcopy_prio_thread);
821 mis->preempt_thread_status = PREEMPT_THREAD_NONE;
822 }
823
824 if (mis->have_fault_thread) {
825 Error *local_err = NULL;
826
827 /* Let the fault thread quit */
828 qatomic_set(&mis->fault_thread_quit, 1);
829 postcopy_fault_thread_notify(mis);
830 trace_postcopy_ram_incoming_cleanup_join();
831 qemu_thread_join(&mis->fault_thread);
832
833 if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
834 error_report_err(local_err);
835 return -1;
836 }
837
838 if (foreach_not_ignored_block(cleanup_range, mis)) {
839 return -1;
840 }
841
842 trace_postcopy_ram_incoming_cleanup_closeuf();
843 close(mis->userfault_fd);
844 close(mis->userfault_event_fd);
845 mis->have_fault_thread = false;
846 }
847
848 if (should_mlock(mlock_state)) {
849 if (os_mlock(is_mlock_on_fault(mlock_state)) < 0) {
850 error_report("mlock: %s", strerror(errno));
851 /*
852 * It doesn't feel right to fail at this point, we have a valid
853 * VM state.
854 */
855 }
856 }
857
858 postcopy_temp_pages_cleanup(mis);
859
860 trace_postcopy_ram_incoming_cleanup_blocktime(
861 get_postcopy_total_blocktime());
862
863 trace_postcopy_ram_incoming_cleanup_exit();
864 return 0;
865 }
866
867 /*
868 * Disable huge pages on an area
869 */
870 static int nhp_range(RAMBlock *rb, void *opaque)
871 {
872 const char *block_name = qemu_ram_get_idstr(rb);
873 void *host_addr = qemu_ram_get_host_addr(rb);
874 ram_addr_t offset = qemu_ram_get_offset(rb);
875 ram_addr_t length = rb->postcopy_length;
876 trace_postcopy_nhp_range(block_name, host_addr, offset, length);
877
878 /*
879 * Before we do discards we need to ensure those discards really
880 * do delete areas of the page, even if THP thinks a hugepage would
881 * be a good idea, so force hugepages off.
882 */
883 qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
884
885 return 0;
886 }
887
888 /*
889 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
890 * however, leaving it until after precopy means that most of the precopy
891 * data is still THP'd.
892 */
893 int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
894 {
895 if (foreach_not_ignored_block(nhp_range, mis)) {
896 return -1;
897 }
898
899 postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
900
901 return 0;
902 }
903
904 /*
905 * Mark the given area of RAM as requiring notification on accesses to unwritten areas.
906 * Used as a callback on foreach_not_ignored_block.
907 * host_addr: Base of area to mark
908 * offset: Offset in the whole ram arena
909 * length: Length of the section
910 * opaque: MigrationIncomingState pointer
911 * Returns 0 on success
912 */
913 static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
914 {
915 MigrationIncomingState *mis = opaque;
916 struct uffdio_register reg_struct;
917
918 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
919 reg_struct.range.len = rb->postcopy_length;
920 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
921
922 /* Now tell our userfault_fd that it's responsible for this area */
923 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
924 error_report("%s userfault register: %s", __func__, strerror(errno));
925 return -1;
926 }
927 if (!(reg_struct.ioctls & (1ULL << _UFFDIO_COPY))) {
928 error_report("%s userfault: Region doesn't support COPY", __func__);
929 return -1;
930 }
931 if (reg_struct.ioctls & (1ULL << _UFFDIO_ZEROPAGE)) {
932 qemu_ram_set_uf_zeroable(rb);
933 }
934
935 return 0;
936 }
937
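/*
 * Wake any thread blocked on @client_addr behind the shared userfaultfd
 * @pcfd, used once the page covering that address has been placed.
 */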
938 int postcopy_wake_shared(struct PostCopyFD *pcfd,
939 uint64_t client_addr,
940 RAMBlock *rb)
941 {
942 size_t pagesize = qemu_ram_pagesize(rb);
943 trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
944 return uffd_wakeup(pcfd->fd,
945 (void *)(uintptr_t)ROUND_DOWN(client_addr, pagesize),
946 pagesize);
947 }
948
949 /*
950 * NOTE: @tid is only used when postcopy-blocktime feature is enabled, and
951 * also optional: when zero is provided, the fault accounting will be ignored.
952 */
953 static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
954 ram_addr_t start, uint64_t haddr, uint32_t tid)
955 {
956 void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
957
958 /*
959 * Discarded pages (via RamDiscardManager) are never migrated. On unlikely
960 * access, place a zeropage, which will also set the relevant bits in the
961 * recv_bitmap accordingly, so we won't try placing a zeropage twice.
962 *
963 * Checking a single bit is sufficient to handle pagesize > TPS as either
964 * all relevant bits are set or not.
965 */
966 assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
967 if (ramblock_page_is_discarded(rb, start)) {
968 bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);
969
970 return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
971 }
972
973 return migrate_send_rp_req_pages(mis, rb, start, haddr, tid);
974 }
975
976 /*
977 * Callback from shared fault handlers to ask for a page,
978 * the page must be specified by a RAMBlock and an offset in that rb
979 * Note: Only for use by shared fault handlers (in fault thread)
980 */
981 int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
982 uint64_t client_addr, uint64_t rb_offset)
983 {
984 uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
985 MigrationIncomingState *mis = migration_incoming_get_current();
986
987 trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
988 rb_offset);
989 if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
990 trace_postcopy_request_shared_page_present(pcfd->idstr,
991 qemu_ram_get_idstr(rb), rb_offset);
992 return postcopy_wake_shared(pcfd, client_addr, rb);
993 }
994 /* TODO: support blocktime tracking */
995 postcopy_request_page(mis, rb, aligned_rbo, client_addr, 0);
996 return 0;
997 }
998
999 static int blocktime_get_vcpu(PostcopyBlocktimeContext *ctx, uint32_t tid)
1000 {
1001 int *found;
1002
1003 found = g_hash_table_lookup(ctx->tid_to_vcpu_hash, GUINT_TO_POINTER(tid));
1004 if (!found) {
1005 /*
1006 * NOTE: this is possible, because QEMU's non-vCPU threads can
1007 * also access a missing page. Or, when KVM async pf is enabled, a
1008 * fault can even happen from a kworker..
1009 */
1010 return -1;
1011 }
1012
1013 return *found;
1014 }
1015
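/* Return the current host time in nanoseconds (QEMU_CLOCK_REALTIME). */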
1016 static uint64_t get_current_ns(void)
1017 {
1018 return (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1019 }
1020
1021 /*
1022 * Inject a (cpu, fault_time) entry into the database, using addr as key.
1023 * When cpu==-1, it means it's a non-vCPU fault.
1024 */
1025 static void blocktime_fault_inject(PostcopyBlocktimeContext *ctx,
1026 uintptr_t addr, int cpu, uint64_t time)
1027 {
1028 BlocktimeVCPUEntry *entry = blocktime_vcpu_entry_alloc(cpu, time);
1029 GHashTable *table = ctx->vcpu_addr_hash;
1030 gpointer key = (gpointer)addr;
1031 GList *head, *list;
1032 gboolean result;
1033
1034 head = g_hash_table_lookup(table, key);
1035 if (head) {
1036 /*
1037 * If it existed, steal @head for the list operation rather than
1038 * freeing it, making sure the steal succeeded.
1039 */
1040 result = g_hash_table_steal(table, key);
1041 assert(result == TRUE);
1042 }
1043
1044 /*
1045 * Now the key is guaranteed to be absent. Two cases:
1046 *
1047 * (1) There was no existing entry; the list contains only the new one. Insert it.
1048 * (2) There were existing entries; after stealing we own the list, so prepend
1049 * the new entry and re-insert.
1050 */
1051 list = g_list_prepend(head, entry);
1052 g_hash_table_insert(table, key, list);
1053
1054 trace_postcopy_blocktime_begin(addr, time, cpu, !!head);
1055 }
1056
1057 /*
1058 * This function is called when a page fault occurs. It tracks vCPU
1059 * blocking time. It's protected by @page_request_mutex.
1060 *
1061 * @addr: faulted host virtual address
1062 * @ptid: faulted process thread id
1063 * @rb: ramblock appropriate to addr
1064 */
1065 void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
1066 RAMBlock *rb)
1067 {
1068 int cpu;
1069 MigrationIncomingState *mis = migration_incoming_get_current();
1070 PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
1071 uint64_t current;
1072
1073 if (!dc || ptid == 0) {
1074 return;
1075 }
1076
1077 /*
1078 * The caller should only inject a blocktime entry when the page is
1079 * still missing.
1080 */
1081 assert(!ramblock_recv_bitmap_test(rb, (void *)addr));
1082
1083 current = get_current_ns();
1084 cpu = blocktime_get_vcpu(dc, ptid);
1085
1086 if (cpu >= 0) {
1087 /* How many faults on this vCPU in total? */
1088 dc->vcpu_faults_count[cpu]++;
1089
1090 /*
1091 * Account how many concurrent faults on this vCPU we trapped. See
1092 * comments above vcpu_faults_current[] on why it can be more than one.
1093 */
1094 if (dc->vcpu_faults_current[cpu]++ == 0) {
1095 dc->smp_cpus_down++;
1096 /*
1097 * We use last_begin to cover (1) the 1st fault on this specific
1098 * vCPU, but meanwhile (2) the last vCPU that got blocked. It's
1099 * only used to calculate system-wide blocktime.
1100 */
1101 dc->last_begin = current;
1102 }
1103
1104 /* Making sure it won't overflow - it really should never! */
1105 assert(dc->vcpu_faults_current[cpu] <= 255);
1106 } else {
1107 /*
1108 * For non-vCPU thread faults, we don't care about tid or cpu index
1109 * or time the thread is blocked (e.g., a kworker trying to help
1110 * KVM when async_pf=on is OK to be blocked and not affect guest
1111 * responsiveness), but we care about latency. Track it with
1112 * cpu=-1.
1113 *
1114 * Note that this will NOT affect blocktime reports on vCPU being
1115 * blocked, but only about system-wide latency reports.
1116 */
1117 dc->non_vcpu_faults++;
1118 }
1119
1120 blocktime_fault_inject(dc, addr, cpu, current);
1121 }
1122
1123 static void blocktime_latency_account(PostcopyBlocktimeContext *ctx,
1124 uint64_t time_us)
1125 {
1126 /*
1127 * Convert time (in us) to the bucket index it belongs to. Take extra
1128 * care with time_us==0, which is rare - when it happens, use bucket 0.
1129 */
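/*
 * For example, time_us == 1000 (1ms): clz64(1000) == 54, so the index is
 * 63 - 54 == 9, i.e. the [512us, 1024us) == [2^9, 2^10) bucket.
 */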
1130 int index = time_us ? (63 - clz64(time_us)) : 0;
1131
1132 assert(index >= 0);
1133
1134 /* If it's too large, put into top bucket */
1135 if (index >= BLOCKTIME_LATENCY_BUCKET_N) {
1136 index = BLOCKTIME_LATENCY_BUCKET_N - 1;
1137 }
1138
1139 ctx->latency_buckets[index]++;
1140 }
1141
1142 typedef struct {
1143 PostcopyBlocktimeContext *ctx;
1144 uint64_t current;
1145 int affected_cpus;
1146 int affected_non_cpus;
1147 } BlockTimeVCPUIter;
1148
1149 static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data)
1150 {
1151 BlockTimeVCPUIter *iter = user_data;
1152 PostcopyBlocktimeContext *ctx = iter->ctx;
1153 BlocktimeVCPUEntry *entry = data;
1154 uint64_t time_passed;
1155 int cpu = entry->cpu;
1156
1157 /*
1158 * Time should never go backwards, so when the fault is resolved it must
1159 * be later than when it was triggered.
1160 */
1161 assert(iter->current >= entry->fault_time);
1162 time_passed = iter->current - entry->fault_time;
1163
1164 /* Latency buckets are in microseconds */
1165 blocktime_latency_account(ctx, time_passed / SCALE_US);
1166
1167 if (cpu >= 0) {
1168 /*
1169 * If we resolved all pending faults on one vCPU due to this page
1170 * resolution, take a note.
1171 */
1172 if (--ctx->vcpu_faults_current[cpu] == 0) {
1173 ctx->vcpu_blocktime_total[cpu] += time_passed;
1174 iter->affected_cpus += 1;
1175 }
1176 trace_postcopy_blocktime_end_one(cpu, ctx->vcpu_faults_current[cpu]);
1177 } else {
1178 iter->affected_non_cpus++;
1179 ctx->non_vcpu_blocktime_total += time_passed;
1180 /*
1181 * We do not track how many non-vCPU faults are pending, because we
1182 * do not care about blocktime, only latency.
1183 */
1184 trace_postcopy_blocktime_end_one(-1, 0);
1185 }
1186 }
1187
1188 /*
1189 * This function provides the calculated blocktime per cpu and traces it.
1190 * Total blocktime is calculated in mark_postcopy_blocktime_end. It's
1191 * protected by @page_request_mutex.
1192 *
1193 * Assume we have 3 CPUs
1194 *
1195 * S1 E1 S1 E1
1196 * -----***********------------xxx***************------------------------> CPU1
1197 *
1198 * S2 E2
1199 * ------------****************xxx---------------------------------------> CPU2
1200 *
1201 * S3 E3
1202 * ------------------------****xxx********-------------------------------> CPU3
1203 *
1204 * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
1205 * S2,E1 - doesn't match the condition because the sequence S1,S2,E1 doesn't include CPU3
1206 * S3,S1,E2 - sequence includes all CPUs, in this case the overlap will be S1,E2 -
1207 * it's a part of total blocktime.
1208 * S1 - here is last_begin
1209 * The legend of the picture is as follows:
1210 * * - means blocktime per vCPU
1211 * x - means overlapped blocktime (total blocktime)
1212 *
1213 * @addr: host virtual address
1214 */
1215 static void mark_postcopy_blocktime_end(uintptr_t addr)
1216 {
1217 MigrationIncomingState *mis = migration_incoming_get_current();
1218 PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
1219 MachineState *ms = MACHINE(qdev_get_machine());
1220 unsigned int smp_cpus = ms->smp.cpus;
1221 BlockTimeVCPUIter iter = {
1222 .current = get_current_ns(),
1223 .affected_cpus = 0,
1224 .affected_non_cpus = 0,
1225 .ctx = dc,
1226 };
1227 gpointer key = (gpointer)addr;
1228 GHashTable *table;
1229 GList *list;
1230
1231 if (!dc) {
1232 return;
1233 }
1234
1235 table = dc->vcpu_addr_hash;
1236 /* the address wasn't tracked at all? */
1237 list = g_hash_table_lookup(table, key);
1238 if (!list) {
1239 return;
1240 }
1241
1242 /*
1243 * Loop over the set of vCPUs that got blocked on this addr, do the
1244 * blocktime accounting. After that, remove the whole list.
1245 */
1246 g_list_foreach(list, blocktime_cpu_list_iter_fn, &iter);
1247 g_hash_table_remove(table, key);
1248
1249 /*
1250 * If all vCPUs used to be down, and copying this page would free some
1251 * vCPUs, then the system-level blocktime ends here.
1252 */
1253 if (dc->smp_cpus_down == smp_cpus && iter.affected_cpus) {
1254 dc->total_blocktime += iter.current - dc->last_begin;
1255 }
1256 dc->smp_cpus_down -= iter.affected_cpus;
1257
1258 trace_postcopy_blocktime_end(addr, iter.current, iter.affected_cpus,
1259 iter.affected_non_cpus);
1260 }
1261
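/* Block the fault thread here until a paused postcopy migration is resumed. */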
1262 static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
1263 {
1264 trace_postcopy_pause_fault_thread();
1265 qemu_sem_wait(&mis->postcopy_pause_sem_fault);
1266 trace_postcopy_pause_fault_thread_continued();
1267 }
1268
1269 /*
1270 * Handle faults detected by the USERFAULT markings
1271 */
1272 static void *postcopy_ram_fault_thread(void *opaque)
1273 {
1274 MigrationIncomingState *mis = opaque;
1275 struct uffd_msg msg;
1276 int ret;
1277 size_t index;
1278 RAMBlock *rb = NULL;
1279
1280 trace_postcopy_ram_fault_thread_entry();
1281 rcu_register_thread();
1282 mis->last_rb = NULL; /* last RAMBlock we sent part of */
1283 qemu_event_set(&mis->thread_sync_event);
1284
1285 struct pollfd *pfd;
1286 size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
1287
1288 pfd = g_new0(struct pollfd, pfd_len);
1289
1290 pfd[0].fd = mis->userfault_fd;
1291 pfd[0].events = POLLIN;
1292 pfd[1].fd = mis->userfault_event_fd;
1293 pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
1294 trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
1295 for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
1296 struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
1297 struct PostCopyFD, index);
1298 pfd[2 + index].fd = pcfd->fd;
1299 pfd[2 + index].events = POLLIN;
1300 trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
1301 pcfd->fd);
1302 }
1303
1304 while (true) {
1305 ram_addr_t rb_offset;
1306 int poll_result;
1307
1308 /*
1309 * We're mainly waiting for the kernel to give us a faulting HVA,
1310 * however we can be told to quit via userfault_quit_fd which is
1311 * an eventfd
1312 */
1313
1314 poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
1315 if (poll_result == -1) {
1316 error_report("%s: userfault poll: %s", __func__, strerror(errno));
1317 break;
1318 }
1319
1320 if (!mis->to_src_file) {
1321 /*
1322 * Possibly someone has told us via the event that the return path
1323 * is already broken. We should hold until
1324 * the channel is rebuilt.
1325 */
1326 postcopy_pause_fault_thread(mis);
1327 }
1328
1329 if (pfd[1].revents) {
1330 uint64_t tmp64 = 0;
1331
1332 /* Consume the signal */
1333 if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
1334 /* Nothing obviously nicer than posting this error. */
1335 error_report("%s: read() failed", __func__);
1336 }
1337
1338 if (qatomic_read(&mis->fault_thread_quit)) {
1339 trace_postcopy_ram_fault_thread_quit();
1340 break;
1341 }
1342 }
1343
1344 if (pfd[0].revents) {
1345 poll_result--;
1346 ret = read(mis->userfault_fd, &msg, sizeof(msg));
1347 if (ret != sizeof(msg)) {
1348 if (errno == EAGAIN) {
1349 /*
1350 * if a wake up happens on the other thread just after
1351 * the poll, there is nothing to read.
1352 */
1353 continue;
1354 }
1355 if (ret < 0) {
1356 error_report("%s: Failed to read full userfault "
1357 "message: %s",
1358 __func__, strerror(errno));
1359 break;
1360 } else {
1361 error_report("%s: Read %d bytes from userfaultfd "
1362 "expected %zd",
1363 __func__, ret, sizeof(msg));
1364 break; /* Lost alignment, don't know what we'd read next */
1365 }
1366 }
1367 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1368 error_report("%s: Read unexpected event %ud from userfaultfd",
1369 __func__, msg.event);
1370 continue; /* It's not a page fault, shouldn't happen */
1371 }
1372
1373 rb = qemu_ram_block_from_host(
1374 (void *)(uintptr_t)msg.arg.pagefault.address,
1375 true, &rb_offset);
1376 if (!rb) {
1377 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
1378 PRIx64, (uint64_t)msg.arg.pagefault.address);
1379 break;
1380 }
1381
1382 rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
1383 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
1384 qemu_ram_get_idstr(rb),
1385 rb_offset,
1386 msg.arg.pagefault.feat.ptid);
1387 retry:
1388 /*
1389 * Send the request to the source - we want to request one
1390 * of our host page sizes (which is >= TPS)
1391 */
1392 ret = postcopy_request_page(mis, rb, rb_offset,
1393 msg.arg.pagefault.address,
1394 msg.arg.pagefault.feat.ptid);
1395 if (ret) {
1396 /* May be network failure, try to wait for recovery */
1397 postcopy_pause_fault_thread(mis);
1398 goto retry;
1399 }
1400 }
1401
1402 /* Now handle any requests from external processes on shared memory */
1403 /* TODO: May need to handle devices deregistering during postcopy */
1404 for (index = 2; index < pfd_len && poll_result; index++) {
1405 if (pfd[index].revents) {
1406 struct PostCopyFD *pcfd =
1407 &g_array_index(mis->postcopy_remote_fds,
1408 struct PostCopyFD, index - 2);
1409
1410 poll_result--;
1411 if (pfd[index].revents & POLLERR) {
1412 error_report("%s: POLLERR on poll %zd fd=%d",
1413 __func__, index, pcfd->fd);
1414 pfd[index].events = 0;
1415 continue;
1416 }
1417
1418 ret = read(pcfd->fd, &msg, sizeof(msg));
1419 if (ret != sizeof(msg)) {
1420 if (errno == EAGAIN) {
1421 /*
1422 * if a wake up happens on the other thread just after
1423 * the poll, there is nothing to read.
1424 */
1425 continue;
1426 }
1427 if (ret < 0) {
1428 error_report("%s: Failed to read full userfault "
1429 "message: %s (shared) revents=%d",
1430 __func__, strerror(errno),
1431 pfd[index].revents);
1432 /*TODO: Could just disable this sharer */
1433 break;
1434 } else {
1435 error_report("%s: Read %d bytes from userfaultfd "
1436 "expected %zd (shared)",
1437 __func__, ret, sizeof(msg));
1438 /*TODO: Could just disable this sharer */
1439 break; /*Lost alignment,don't know what we'd read next*/
1440 }
1441 }
1442 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1443 error_report("%s: Read unexpected event %ud "
1444 "from userfaultfd (shared)",
1445 __func__, msg.event);
1446 continue; /* It's not a page fault, shouldn't happen */
1447 }
1448 /* Call the device handler registered with us */
1449 ret = pcfd->handler(pcfd, &msg);
1450 if (ret) {
1451 error_report("%s: Failed to resolve shared fault on %zd/%s",
1452 __func__, index, pcfd->idstr);
1453 /* TODO: Fail? Disable this sharer? */
1454 }
1455 }
1456 }
1457 }
1458 rcu_unregister_thread();
1459 trace_postcopy_ram_fault_thread_exit();
1460 g_free(pfd);
1461 return NULL;
1462 }
1463
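/*
 * Allocate the per-channel temporary pages used to stage incoming host pages
 * before they are atomically placed, plus a zeroed page of the largest page
 * size for RAM blocks that cannot use UFFDIO_ZEROPAGE (e.g. hugetlbfs).
 */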
1464 static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
1465 {
1466 PostcopyTmpPage *tmp_page;
1467 int err, i, channels;
1468 void *temp_page;
1469
1470 if (migrate_postcopy_preempt()) {
1471 /* If preemption enabled, need extra channel for urgent requests */
1472 mis->postcopy_channels = RAM_CHANNEL_MAX;
1473 } else {
1474 /* Both precopy/postcopy on the same channel */
1475 mis->postcopy_channels = 1;
1476 }
1477
1478 channels = mis->postcopy_channels;
1479 mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels);
1480
1481 for (i = 0; i < channels; i++) {
1482 tmp_page = &mis->postcopy_tmp_pages[i];
1483 temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE,
1484 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1485 if (temp_page == MAP_FAILED) {
1486 err = errno;
1487 error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s",
1488 __func__, i, strerror(err));
1489 /* Clean up will be done later */
1490 return -err;
1491 }
1492 tmp_page->tmp_huge_page = temp_page;
1493 /* Initialize default states for each tmp page */
1494 postcopy_temp_page_reset(tmp_page);
1495 }
1496
1497 /*
1498 * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
1499 */
1500 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1501 PROT_READ | PROT_WRITE,
1502 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1503 if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1504 err = errno;
1505 mis->postcopy_tmp_zero_page = NULL;
1506 error_report("%s: Failed to map large zero page %s",
1507 __func__, strerror(err));
1508 return -err;
1509 }
1510
1511 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1512
1513 return 0;
1514 }
1515
1516 int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1517 {
1518 Error *local_err = NULL;
1519
1520 /* Open the fd for the kernel to give us userfaults */
1521 mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
1522 if (mis->userfault_fd == -1) {
1523 error_report("%s: Failed to open userfault fd: %s", __func__,
1524 strerror(errno));
1525 return -1;
1526 }
1527
1528 /*
1529 * Although the host check already tested the API, we need to
1530 * do the check again as an ABI handshake on the new fd.
1531 */
1532 if (!ufd_check_and_apply(mis->userfault_fd, mis, &local_err)) {
1533 error_report_err(local_err);
1534 return -1;
1535 }
1536
1537 if (migrate_postcopy_blocktime()) {
1538 assert(mis->blocktime_ctx == NULL);
1539 mis->blocktime_ctx = blocktime_context_new();
1540 }
1541
1542 /* Now an eventfd we use to tell the fault-thread to quit */
1543 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1544 if (mis->userfault_event_fd == -1) {
1545 error_report("%s: Opening userfault_event_fd: %s", __func__,
1546 strerror(errno));
1547 close(mis->userfault_fd);
1548 return -1;
1549 }
1550
1551 postcopy_thread_create(mis, &mis->fault_thread,
1552 MIGRATION_THREAD_DST_FAULT,
1553 postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
1554 mis->have_fault_thread = true;
1555
1556 /* Mark so that we get notified of accesses to unwritten areas */
1557 if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
1558 error_report("ram_block_enable_notify failed");
1559 return -1;
1560 }
1561
1562 if (postcopy_temp_pages_setup(mis)) {
1563 /* Error dumped in the sub-function */
1564 return -1;
1565 }
1566
1567 if (migrate_postcopy_preempt()) {
1568 /*
1569 * This thread needs to be created after the temp pages because
1570 * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
1571 */
1572 postcopy_thread_create(mis, &mis->postcopy_prio_thread,
1573 MIGRATION_THREAD_DST_PREEMPT,
1574 postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
1575 mis->preempt_thread_status = PREEMPT_THREAD_CREATED;
1576 }
1577
1578 trace_postcopy_ram_enable_notify();
1579
1580 return 0;
1581 }
1582
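/*
 * Atomically place one page: copy @from_addr into @host_addr via UFFDIO_COPY,
 * or install a zero page when @from_addr is NULL. On success, also update the
 * receive bitmap, the requested-page bookkeeping and the blocktime accounting
 * under @page_request_mutex.
 */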
1583 static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
1584 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1585 {
1586 int userfault_fd = mis->userfault_fd;
1587 int ret;
1588
1589 if (from_addr) {
1590 ret = uffd_copy_page(userfault_fd, host_addr, from_addr, pagesize,
1591 false);
1592 } else {
1593 ret = uffd_zero_page(userfault_fd, host_addr, pagesize, false);
1594 }
1595 if (!ret) {
1596 qemu_mutex_lock(&mis->page_request_mutex);
1597 ramblock_recv_bitmap_set_range(rb, host_addr,
1598 pagesize / qemu_target_page_size());
1599 /*
1600 * If this page resolves a page fault for a previously recorded faulted
1601 * address, take a special note to maintain the requested page list.
1602 */
1603 if (g_tree_lookup(mis->page_requested, host_addr)) {
1604 g_tree_remove(mis->page_requested, host_addr);
1605 int left_pages = qatomic_dec_fetch(&mis->page_requested_count);
1606
1607 trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
1608 /* Order the update of count and read of preempt status */
1609 smp_mb();
1610 if (mis->preempt_thread_status == PREEMPT_THREAD_QUIT &&
1611 left_pages == 0) {
1612 /*
1613 * This probably means the main thread is waiting for us.
1614 * Notify that we've finished receiving the last requested
1615 * page.
1616 */
1617 qemu_cond_signal(&mis->page_request_cond);
1618 }
1619 }
1620 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1621 qemu_mutex_unlock(&mis->page_request_mutex);
1622 }
1623 return ret;
1624 }
1625
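/*
 * Notify all registered shared userfault handlers that the page at @offset
 * within @rb has been placed, so they can wake their blocked clients.
 */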
1626 int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1627 {
1628 int i;
1629 MigrationIncomingState *mis = migration_incoming_get_current();
1630 GArray *pcrfds = mis->postcopy_remote_fds;
1631
1632 for (i = 0; i < pcrfds->len; i++) {
1633 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1634 int ret = cur->waker(cur, rb, offset);
1635 if (ret) {
1636 return ret;
1637 }
1638 }
1639 return 0;
1640 }
1641
1642 /*
1643 * Place a host page (from) at (host) atomically
1644 * returns 0 on success
1645 */
1646 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1647 RAMBlock *rb)
1648 {
1649 size_t pagesize = qemu_ram_pagesize(rb);
1650 int e;
1651
1652 /* The copy also acks to the kernel, waking the stalled thread up.
1653 * TODO: We can inhibit that ack and only do it if it was requested
1654 * which would be slightly cheaper, but we'd have to be careful
1655 * of the order of updating our page state.
1656 */
1657 e = qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb);
1658 if (e) {
1659 return e;
1660 }
1661
1662 trace_postcopy_place_page(host);
1663 return postcopy_notify_shared_wake(rb,
1664 qemu_ram_block_host_offset(rb, host));
1665 }
1666
1667 /*
1668 * Place a zero page at (host) atomically
1669 * returns 0 on success
1670 */
1671 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1672 RAMBlock *rb)
1673 {
1674 size_t pagesize = qemu_ram_pagesize(rb);
1675 trace_postcopy_place_page_zero(host);
1676
1677 /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
1678 * but it's not available for everything (e.g. hugetlbpages)
1679 */
1680 if (qemu_ram_is_uf_zeroable(rb)) {
1681 int e;
1682 e = qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb);
1683 if (e) {
1684 return e;
1685 }
1686 return postcopy_notify_shared_wake(rb,
1687 qemu_ram_block_host_offset(rb,
1688 host));
1689 } else {
1690 return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
1691 }
1692 }
1693
1694 #else
1695 /* No target OS support, stubs just fail */
1696 void fill_destination_postcopy_migration_info(MigrationInfo *info)
1697 {
1698 }
1699
1700 bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
1701 {
1702 error_report("%s: No OS support", __func__);
1703 return false;
1704 }
1705
1706 int postcopy_ram_incoming_init(MigrationIncomingState *mis)
1707 {
1708 error_report("postcopy_ram_incoming_init: No OS support");
1709 return -1;
1710 }
1711
1712 int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
1713 {
1714 g_assert_not_reached();
1715 }
1716
1717 int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
1718 {
1719 g_assert_not_reached();
1720 }
1721
1722 int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
1723 uint64_t client_addr, uint64_t rb_offset)
1724 {
1725 g_assert_not_reached();
1726 }
1727
1728 int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1729 {
1730 g_assert_not_reached();
1731 }
1732
1733 int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1734 RAMBlock *rb)
1735 {
1736 g_assert_not_reached();
1737 }
1738
1739 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1740 RAMBlock *rb)
1741 {
1742 g_assert_not_reached();
1743 }
1744
1745 int postcopy_wake_shared(struct PostCopyFD *pcfd,
1746 uint64_t client_addr,
1747 RAMBlock *rb)
1748 {
1749 g_assert_not_reached();
1750 }
1751
1752 void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
1753 RAMBlock *rb)
1754 {
1755 }
1756 #endif
1757
1758 /* ------------------------------------------------------------------------- */
1759 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page)
1760 {
1761 tmp_page->target_pages = 0;
1762 tmp_page->host_addr = NULL;
1763 /*
1764 * This is set to true on reset, and cleared as soon as we receive any
1765 * non-zero small page within this huge page.
1766 */
1767 tmp_page->all_zero = true;
1768 }
1769
1770 void postcopy_fault_thread_notify(MigrationIncomingState *mis)
1771 {
1772 uint64_t tmp64 = 1;
1773
1774 /*
1775 * Wake up the fault_thread. It's an eventfd that should currently
1776 * be at 0; we're going to increment it to 1.
1777 */
1778 if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
1779 /* Not much we can do here, but may as well report it */
1780 error_report("%s: incrementing failed: %s", __func__,
1781 strerror(errno));
1782 }
1783 }
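
/*
 * Illustrative sketch (not compiled): the consumer side of the eventfd
 * written above.  A poll loop, such as the fault thread, can include
 * userfault_event_fd in its pollfd set and drain the counter with a
 * single 8-byte read once it becomes readable.  The surrounding loop and
 * quit handling are omitted and assumed here.
 *
 *     struct pollfd pfd = {
 *         .fd = mis->userfault_event_fd,
 *         .events = POLLIN,
 *     };
 *     uint64_t val;
 *
 *     if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN) &&
 *         read(mis->userfault_event_fd, &val, sizeof(val)) != sizeof(val)) {
 *         error_report("%s: draining eventfd failed: %s", __func__,
 *                      strerror(errno));
 *     }
 */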
1784
1785 /**
1786 * postcopy_discard_send_init: Called at the start of each RAMBlock before
1787 * asking to discard individual ranges.
1788 *
1789 * @ms: The current migration state.
1791 * @name: RAMBlock that discards will operate on.
1792 */
1793 static PostcopyDiscardState pds = {0};
1794 void postcopy_discard_send_init(MigrationState *ms, const char *name)
1795 {
1796 pds.ramblock_name = name;
1797 pds.cur_entry = 0;
1798 pds.nsentwords = 0;
1799 pds.nsentcmds = 0;
1800 }
1801
1802 /**
1803 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
1804 * discard. May send a discard message, may just leave it queued to
1805 * be sent later.
1806 *
1807 * @ms: Current migration state.
1808 * @start,@length: a range of pages in the migration bitmap in the
1809 * RAM block passed to postcopy_discard_send_init() (length=1 is one page)
1810 */
1811 void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
1812 unsigned long length)
1813 {
1814 size_t tp_size = qemu_target_page_size();
1815 /* Convert to byte offsets within the RAM block */
1816 pds.start_list[pds.cur_entry] = start * tp_size;
1817 pds.length_list[pds.cur_entry] = length * tp_size;
1818 trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
1819 pds.cur_entry++;
1820 pds.nsentwords++;
1821
1822 if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
1823 /* Full set, ship it! */
1824 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1825 pds.ramblock_name,
1826 pds.cur_entry,
1827 pds.start_list,
1828 pds.length_list);
1829 pds.nsentcmds++;
1830 pds.cur_entry = 0;
1831 }
1832 }
1833
1834 /**
1835 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
1836 * bitmap code. Sends any outstanding discard messages.
1837 *
1838 * @ms: Current migration state.
1839 */
1840 void postcopy_discard_send_finish(MigrationState *ms)
1841 {
1842 /* Anything unsent? */
1843 if (pds.cur_entry) {
1844 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1845 pds.ramblock_name,
1846 pds.cur_entry,
1847 pds.start_list,
1848 pds.length_list);
1849 pds.nsentcmds++;
1850 }
1851
1852 trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
1853 pds.nsentcmds);
1854 }
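
/*
 * Illustrative sketch (not compiled): how bitmap-walking code on the
 * source might drive the three helpers above for one RAMBlock.  The
 * helper find_next_dirty_run() is hypothetical, and start/length are in
 * target-page units within the block, as documented above.
 *
 *     unsigned long start, length;
 *
 *     postcopy_discard_send_init(ms, block->idstr);
 *     while (find_next_dirty_run(block, &start, &length)) {
 *         postcopy_discard_send_range(ms, start, length);
 *     }
 *     postcopy_discard_send_finish(ms);
 */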
1855
1856 /*
1857 * Current state of incoming postcopy; note this is not part of
1858 * MigrationIncomingState since its state is used during cleanup
1859 * at the end as MIS is being freed.
1860 */
1861 static PostcopyState incoming_postcopy_state;
1862
1863 PostcopyState postcopy_state_get(void)
1864 {
1865 return qatomic_load_acquire(&incoming_postcopy_state);
1866 }
1867
1868 /* Set the state and return the old state */
1869 PostcopyState postcopy_state_set(PostcopyState new_state)
1870 {
1871 return qatomic_xchg(&incoming_postcopy_state, new_state);
1872 }
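
/*
 * Illustrative sketch (not compiled): because postcopy_state_set()
 * returns the previous state, a caller can validate a transition in a
 * single atomic step.  The particular states below only illustrate the
 * pattern.
 *
 *     PostcopyState old = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
 *
 *     if (old != POSTCOPY_INCOMING_DISCARD) {
 *         error_report("unexpected postcopy state transition from %d", old);
 *     }
 */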
1873
1874 /* Register a handler for external shared memory postcopy;
1875 * called on the destination.
1876 */
1877 void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
1878 {
1879 MigrationIncomingState *mis = migration_incoming_get_current();
1880
1881 mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
1882 *pcfd);
1883 }
1884
1885 /* Unregister a handler for external shared memory postcopy
1886 */
1887 void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1888 {
1889 guint i;
1890 MigrationIncomingState *mis = migration_incoming_get_current();
1891 GArray *pcrfds = mis->postcopy_remote_fds;
1892
1893 if (!pcrfds) {
1894 /* migration has already finished and freed the array */
1895 return;
1896 }
1897 for (i = 0; i < pcrfds->len; i++) {
1898 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1899 if (cur->fd == pcfd->fd) {
1900 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1901 return;
1902 }
1903 }
1904 }
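
/*
 * Illustrative sketch (not compiled): a backend that owns a userfaultfd
 * for externally shared memory registers a PostCopyFD so the incoming
 * side can service and wake it, and unregisters the same structure on
 * teardown.  Only the .fd and .waker members are referenced in this file;
 * uffd and my_waker are assumptions for illustration.
 *
 *     struct PostCopyFD pcfd = {
 *         .fd = uffd,
 *         .waker = my_waker,
 *     };
 *
 *     postcopy_register_shared_ufd(&pcfd);
 *     ...
 *     postcopy_unregister_shared_ufd(&pcfd);
 */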
1905
1906 void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
1907 {
1908 /*
1909 * The new loading channel has its own threads, so it needs to be in
1910 * blocking mode too. That's the default; just be explicit about it.
1911 */
1912 qemu_file_set_blocking(file, true);
1913 mis->postcopy_qemufile_dst = file;
1914 qemu_sem_post(&mis->postcopy_qemufile_dst_done);
1915 trace_postcopy_preempt_new_channel();
1916 }
1917
1918 /*
1919 * Set up the postcopy preempt channel with the IOC. If ERROR is specified,
1920 * record the error instead. This helper frees ERROR if specified.
1921 */
1922 static void
1923 postcopy_preempt_send_channel_done(MigrationState *s,
1924 QIOChannel *ioc, Error *local_err)
1925 {
1926 if (local_err) {
1927 migrate_set_error(s, local_err);
1928 error_free(local_err);
1929 } else {
1930 migration_ioc_register_yank(ioc);
1931 s->postcopy_qemufile_src = qemu_file_new_output(ioc);
1932 trace_postcopy_preempt_new_channel();
1933 }
1934
1935 /*
1936 * Kick the waiter in all cases. The waiter should check
1937 * postcopy_qemufile_src to know whether the setup failed or not.
1938 */
1939 qemu_sem_post(&s->postcopy_qemufile_src_sem);
1940 }
1941
1942 static void
1943 postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque)
1944 {
1945 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1946 MigrationState *s = opaque;
1947 Error *local_err = NULL;
1948
1949 qio_task_propagate_error(task, &local_err);
1950 postcopy_preempt_send_channel_done(s, ioc, local_err);
1951 }
1952
1953 static void
1954 postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
1955 {
1956 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1957 MigrationState *s = opaque;
1958 QIOChannelTLS *tioc;
1959 Error *local_err = NULL;
1960
1961 if (qio_task_propagate_error(task, &local_err)) {
1962 goto out;
1963 }
1964
1965 if (migrate_channel_requires_tls_upgrade(ioc)) {
1966 tioc = migration_tls_client_create(ioc, s->hostname, &local_err);
1967 if (!tioc) {
1968 goto out;
1969 }
1970 trace_postcopy_preempt_tls_handshake();
1971 qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt");
1972 qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake,
1973 s, NULL, NULL);
1974 /* Channel setup continues in the TLS handshake callback */
1975 return;
1976 }
1977
1978 out:
1979 /* This handles both good and error cases */
1980 postcopy_preempt_send_channel_done(s, ioc, local_err);
1981 }
1982
1983 /*
1984 * This function will kick off an async task to establish the preempt
1985 * channel, and wait until the connection setup has completed. Returns 0 if
1986 * the channel is established, -1 on error.
1987 */
1988 int postcopy_preempt_establish_channel(MigrationState *s)
1989 {
1990 /* If preempt not enabled, no need to wait */
1991 if (!migrate_postcopy_preempt()) {
1992 return 0;
1993 }
1994
1995 /*
1996 * Kick off an async task to establish the preempt channel. Only do so
1997 * with 8.0+ machines, because 7.1/7.2 require the channel to be created
1998 * in the setup phase of migration (even if racy on an unreliable network).
1999 */
2000 if (!s->preempt_pre_7_2) {
2001 postcopy_preempt_setup(s);
2002 }
2003
2004 /*
2005 * We need the postcopy preempt channel to be established before
2006 * we start doing anything.
2007 */
2008 qemu_sem_wait(&s->postcopy_qemufile_src_sem);
2009
2010 return s->postcopy_qemufile_src ? 0 : -1;
2011 }
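
/*
 * Illustrative sketch (not compiled): a typical call site on the source
 * side checks the return value before switching to postcopy, failing the
 * switchover when the preempt channel could not be established.  The
 * surrounding switchover logic is assumed for illustration.
 *
 *     if (postcopy_preempt_establish_channel(s)) {
 *         error_report("%s: failed to establish preempt channel", __func__);
 *         return -1;
 *     }
 */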
2012
2013 void postcopy_preempt_setup(MigrationState *s)
2014 {
2015 /* Kick an async task to connect */
2016 socket_send_channel_create(postcopy_preempt_send_channel_new, s);
2017 }
2018
2019 static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
2020 {
2021 trace_postcopy_pause_fast_load();
2022 qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
2023 qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
2024 qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
2025 trace_postcopy_pause_fast_load_continued();
2026 }
2027
2028 static bool preempt_thread_should_run(MigrationIncomingState *mis)
2029 {
2030 return mis->preempt_thread_status != PREEMPT_THREAD_QUIT;
2031 }
2032
2033 void *postcopy_preempt_thread(void *opaque)
2034 {
2035 MigrationIncomingState *mis = opaque;
2036 int ret;
2037
2038 trace_postcopy_preempt_thread_entry();
2039
2040 rcu_register_thread();
2041
2042 qemu_event_set(&mis->thread_sync_event);
2043
2044 /*
2045 * The preempt channel is established asynchronously. Wait
2046 * for its completion.
2047 */
2048 qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
2049
2050 /* The source sends RAM_SAVE_FLAG_EOS to terminate this thread */
2051 qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
2052 while (preempt_thread_should_run(mis)) {
2053 ret = ram_load_postcopy(mis->postcopy_qemufile_dst,
2054 RAM_CHANNEL_POSTCOPY);
2055 /* If an error happened, go into the recovery routine */
2056 if (ret && preempt_thread_should_run(mis)) {
2057 postcopy_pause_ram_fast_load(mis);
2058 } else {
2059 /* We're done */
2060 break;
2061 }
2062 }
2063 qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
2064
2065 rcu_unregister_thread();
2066
2067 trace_postcopy_preempt_thread_exit();
2068
2069 return NULL;
2070 }
2071
2072 bool postcopy_is_paused(MigrationStatus status)
2073 {
2074 return status == MIGRATION_STATUS_POSTCOPY_PAUSED ||
2075 status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP;
2076 }
2077