1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Userfaultfd tests util functions
4  *
5  * Copyright (C) 2015-2023  Red Hat, Inc.
6  */
7 
8 #include "uffd-common.h"
9 
10 #define BASE_PMD_ADDR ((void *)(1UL << 30))
11 
12 volatile bool test_uffdio_copy_eexist = true;
13 unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
14 char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
15 int uffd = -1, uffd_flags, finished, *pipefd, test_type;
16 bool map_shared;
17 bool test_uffdio_wp = true;
18 unsigned long long *count_verify;
19 uffd_test_ops_t *uffd_test_ops;
20 
21 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
22 {
23 	unsigned int memfd_flags = 0;
24 	int mem_fd;
25 
26 	if (hugetlb)
27 		memfd_flags = MFD_HUGETLB;
28 	mem_fd = memfd_create("uffd-test", memfd_flags);
29 	if (mem_fd < 0)
30 		err("memfd_create");
31 	if (ftruncate(mem_fd, mem_size))
32 		err("ftruncate");
33 	if (fallocate(mem_fd,
34 		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
35 		      mem_size))
36 		err("fallocate");
37 
38 	return mem_fd;
39 }
40 
41 static void anon_release_pages(char *rel_area)
42 {
43 	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
44 		err("madvise(MADV_DONTNEED) failed");
45 }
46 
47 static int anon_allocate_area(void **alloc_area, bool is_src)
48 {
49 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
50 			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
51 	if (*alloc_area == MAP_FAILED) {
52 		*alloc_area = NULL;
53 		return -errno;
54 	}
55 	return 0;
56 }
57 
58 static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
59 {
60 }
61 
62 static void hugetlb_release_pages(char *rel_area)
63 {
64 	if (!map_shared) {
65 		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
66 			err("madvise(MADV_DONTNEED) failed");
67 	} else {
68 		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
69 			err("madvise(MADV_REMOVE) failed");
70 	}
71 }
72 
73 static int hugetlb_allocate_area(void **alloc_area, bool is_src)
74 {
75 	off_t size = nr_pages * page_size;
76 	off_t offset = is_src ? 0 : size;
77 	void *area_alias = NULL;
78 	char **alloc_area_alias;
79 	int mem_fd = uffd_mem_fd_create(size * 2, true);
80 
81 	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
82 			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
83 			   (is_src ? 0 : MAP_NORESERVE),
84 			   mem_fd, offset);
85 	if (*alloc_area == MAP_FAILED) {
86 		*alloc_area = NULL;
87 		return -errno;
88 	}
89 
90 	if (map_shared) {
91 		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
92 				  MAP_SHARED, mem_fd, offset);
93 		if (area_alias == MAP_FAILED)
94 			return -errno;
95 	}
96 
97 	if (is_src) {
98 		alloc_area_alias = &area_src_alias;
99 	} else {
100 		alloc_area_alias = &area_dst_alias;
101 	}
102 	if (area_alias)
103 		*alloc_area_alias = area_alias;
104 
105 	close(mem_fd);
106 	return 0;
107 }
108 
109 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
110 {
111 	if (!map_shared)
112 		return;
113 
114 	*start = (unsigned long) area_dst_alias + offset;
115 }
116 
117 static void shmem_release_pages(char *rel_area)
118 {
119 	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
120 		err("madvise(MADV_REMOVE) failed");
121 }
122 
123 static int shmem_allocate_area(void **alloc_area, bool is_src)
124 {
125 	void *area_alias = NULL;
126 	size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
127 	unsigned long offset = is_src ? 0 : bytes;
128 	char *p = NULL, *p_alias = NULL;
129 	int mem_fd = uffd_mem_fd_create(bytes * 2, false);
130 
131 	/* TODO: clean this up.  Use a static addr is ugly */
132 	p = BASE_PMD_ADDR;
133 	if (!is_src)
134 		/* src map + alias + interleaved hpages */
135 		p += 2 * (bytes + hpage_size);
136 	p_alias = p;
137 	p_alias += bytes;
138 	p_alias += hpage_size;  /* Prevent src/dst VMA merge */
139 
140 	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
141 			   mem_fd, offset);
142 	if (*alloc_area == MAP_FAILED) {
143 		*alloc_area = NULL;
144 		return -errno;
145 	}
146 	if (*alloc_area != p)
147 		err("mmap of memfd failed at %p", p);
148 
149 	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
150 			  mem_fd, offset);
151 	if (area_alias == MAP_FAILED) {
152 		munmap(*alloc_area, bytes);
153 		*alloc_area = NULL;
154 		return -errno;
155 	}
156 	if (area_alias != p_alias)
157 		err("mmap of anonymous memory failed at %p", p_alias);
158 
159 	if (is_src)
160 		area_src_alias = area_alias;
161 	else
162 		area_dst_alias = area_alias;
163 
164 	close(mem_fd);
165 	return 0;
166 }
167 
168 static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
169 {
170 	*start = (unsigned long)area_dst_alias + offset;
171 }
172 
173 static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
174 {
175 	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
176 			      read_pmd_pagesize()))
177 		err("Did not find expected %d number of hugepages",
178 		    expect_nr_hpages);
179 }
180 
181 struct uffd_test_ops anon_uffd_test_ops = {
182 	.allocate_area = anon_allocate_area,
183 	.release_pages = anon_release_pages,
184 	.alias_mapping = noop_alias_mapping,
185 	.check_pmd_mapping = NULL,
186 };
187 
188 struct uffd_test_ops shmem_uffd_test_ops = {
189 	.allocate_area = shmem_allocate_area,
190 	.release_pages = shmem_release_pages,
191 	.alias_mapping = shmem_alias_mapping,
192 	.check_pmd_mapping = shmem_check_pmd_mapping,
193 };
194 
195 struct uffd_test_ops hugetlb_uffd_test_ops = {
196 	.allocate_area = hugetlb_allocate_area,
197 	.release_pages = hugetlb_release_pages,
198 	.alias_mapping = hugetlb_alias_mapping,
199 	.check_pmd_mapping = NULL,
200 };
201 
202 void uffd_stats_report(struct uffd_args *args, int n_cpus)
203 {
204 	int i;
205 	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
206 
207 	for (i = 0; i < n_cpus; i++) {
208 		miss_total += args[i].missing_faults;
209 		wp_total += args[i].wp_faults;
210 		minor_total += args[i].minor_faults;
211 	}
212 
213 	printf("userfaults: ");
214 	if (miss_total) {
215 		printf("%llu missing (", miss_total);
216 		for (i = 0; i < n_cpus; i++)
217 			printf("%lu+", args[i].missing_faults);
218 		printf("\b) ");
219 	}
220 	if (wp_total) {
221 		printf("%llu wp (", wp_total);
222 		for (i = 0; i < n_cpus; i++)
223 			printf("%lu+", args[i].wp_faults);
224 		printf("\b) ");
225 	}
226 	if (minor_total) {
227 		printf("%llu minor (", minor_total);
228 		for (i = 0; i < n_cpus; i++)
229 			printf("%lu+", args[i].minor_faults);
230 		printf("\b)");
231 	}
232 	printf("\n");
233 }
234 
235 int userfaultfd_open(uint64_t *features)
236 {
237 	struct uffdio_api uffdio_api;
238 
239 	uffd = uffd_open(UFFD_FLAGS);
240 	if (uffd < 0)
241 		return -1;
242 	uffd_flags = fcntl(uffd, F_GETFD, NULL);
243 
244 	uffdio_api.api = UFFD_API;
245 	uffdio_api.features = *features;
246 	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
247 		/* Probably lack of CAP_PTRACE? */
248 		return -1;
249 	if (uffdio_api.api != UFFD_API)
250 		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
251 
252 	*features = uffdio_api.features;
253 	return 0;
254 }
255 
256 static inline void munmap_area(void **area)
257 {
258 	if (*area)
259 		if (munmap(*area, nr_pages * page_size))
260 			err("munmap");
261 
262 	*area = NULL;
263 }
264 
265 static void uffd_test_ctx_clear(void)
266 {
267 	size_t i;
268 
269 	if (pipefd) {
270 		for (i = 0; i < nr_cpus * 2; ++i) {
271 			if (close(pipefd[i]))
272 				err("close pipefd");
273 		}
274 		free(pipefd);
275 		pipefd = NULL;
276 	}
277 
278 	if (count_verify) {
279 		free(count_verify);
280 		count_verify = NULL;
281 	}
282 
283 	if (uffd != -1) {
284 		if (close(uffd))
285 			err("close uffd");
286 		uffd = -1;
287 	}
288 
289 	munmap_area((void **)&area_src);
290 	munmap_area((void **)&area_src_alias);
291 	munmap_area((void **)&area_dst);
292 	munmap_area((void **)&area_dst_alias);
293 	munmap_area((void **)&area_remap);
294 }
295 
296 int uffd_test_ctx_init(uint64_t features, const char **errmsg)
297 {
298 	unsigned long nr, cpu;
299 	int ret;
300 
301 	uffd_test_ctx_clear();
302 
303 	ret = uffd_test_ops->allocate_area((void **)&area_src, true);
304 	ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
305 	if (ret) {
306 		if (errmsg)
307 			*errmsg = "memory allocation failed";
308 		return ret;
309 	}
310 
311 	ret = userfaultfd_open(&features);
312 	if (ret) {
313 		if (errmsg)
314 			*errmsg = "possible lack of priviledge";
315 		return ret;
316 	}
317 
318 	count_verify = malloc(nr_pages * sizeof(unsigned long long));
319 	if (!count_verify)
320 		err("count_verify");
321 
322 	for (nr = 0; nr < nr_pages; nr++) {
323 		*area_mutex(area_src, nr) =
324 			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
325 		count_verify[nr] = *area_count(area_src, nr) = 1;
326 		/*
327 		 * In the transition between 255 to 256, powerpc will
328 		 * read out of order in my_bcmp and see both bytes as
329 		 * zero, so leave a placeholder below always non-zero
330 		 * after the count, to avoid my_bcmp to trigger false
331 		 * positives.
332 		 */
333 		*(area_count(area_src, nr) + 1) = 1;
334 	}
335 
336 	/*
337 	 * After initialization of area_src, we must explicitly release pages
338 	 * for area_dst to make sure it's fully empty.  Otherwise we could have
339 	 * some area_dst pages be errornously initialized with zero pages,
340 	 * hence we could hit memory corruption later in the test.
341 	 *
342 	 * One example is when THP is globally enabled, above allocate_area()
343 	 * calls could have the two areas merged into a single VMA (as they
344 	 * will have the same VMA flags so they're mergeable).  When we
345 	 * initialize the area_src above, it's possible that some part of
346 	 * area_dst could have been faulted in via one huge THP that will be
347 	 * shared between area_src and area_dst.  It could cause some of the
348 	 * area_dst won't be trapped by missing userfaults.
349 	 *
350 	 * This release_pages() will guarantee even if that happened, we'll
351 	 * proactively split the thp and drop any accidentally initialized
352 	 * pages within area_dst.
353 	 */
354 	uffd_test_ops->release_pages(area_dst);
355 
356 	pipefd = malloc(sizeof(int) * nr_cpus * 2);
357 	if (!pipefd)
358 		err("pipefd");
359 	for (cpu = 0; cpu < nr_cpus; cpu++)
360 		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
361 			err("pipe");
362 
363 	return 0;
364 }
365 
366 void wp_range(int ufd, __u64 start, __u64 len, bool wp)
367 {
368 	struct uffdio_writeprotect prms;
369 
370 	/* Write protection page faults */
371 	prms.range.start = start;
372 	prms.range.len = len;
373 	/* Undo write-protect, do wakeup after that */
374 	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
375 
376 	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
377 		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
378 }
379 
380 static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
381 {
382 	struct uffdio_continue req;
383 	int ret;
384 
385 	req.range.start = start;
386 	req.range.len = len;
387 	req.mode = 0;
388 	if (wp)
389 		req.mode |= UFFDIO_CONTINUE_MODE_WP;
390 
391 	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
392 		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
393 		    (uint64_t)start);
394 
395 	/*
396 	 * Error handling within the kernel for continue is subtly different
397 	 * from copy or zeropage, so it may be a source of bugs. Trigger an
398 	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
399 	 */
400 	req.mapped = 0;
401 	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
402 	if (ret >= 0 || req.mapped != -EEXIST)
403 		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
404 		    ret, (int64_t) req.mapped);
405 }
406 
407 int uffd_read_msg(int ufd, struct uffd_msg *msg)
408 {
409 	int ret = read(uffd, msg, sizeof(*msg));
410 
411 	if (ret != sizeof(*msg)) {
412 		if (ret < 0) {
413 			if (errno == EAGAIN || errno == EINTR)
414 				return 1;
415 			err("blocking read error");
416 		} else {
417 			err("short read");
418 		}
419 	}
420 
421 	return 0;
422 }
423 
424 void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
425 {
426 	unsigned long offset;
427 
428 	if (msg->event != UFFD_EVENT_PAGEFAULT)
429 		err("unexpected msg event %u", msg->event);
430 
431 	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
432 		/* Write protect page faults */
433 		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
434 		args->wp_faults++;
435 	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
436 		uint8_t *area;
437 		int b;
438 
439 		/*
440 		 * Minor page faults
441 		 *
442 		 * To prove we can modify the original range for testing
443 		 * purposes, we're going to bit flip this range before
444 		 * continuing.
445 		 *
446 		 * Note that this requires all minor page fault tests operate on
447 		 * area_dst (non-UFFD-registered) and area_dst_alias
448 		 * (UFFD-registered).
449 		 */
450 
451 		area = (uint8_t *)(area_dst +
452 				   ((char *)msg->arg.pagefault.address -
453 				    area_dst_alias));
454 		for (b = 0; b < page_size; ++b)
455 			area[b] = ~area[b];
456 		continue_range(uffd, msg->arg.pagefault.address, page_size,
457 			       args->apply_wp);
458 		args->minor_faults++;
459 	} else {
460 		/*
461 		 * Missing page faults.
462 		 *
463 		 * Here we force a write check for each of the missing mode
464 		 * faults.  It's guaranteed because the only threads that
465 		 * will trigger uffd faults are the locking threads, and
466 		 * their first instruction to touch the missing page will
467 		 * always be pthread_mutex_lock().
468 		 *
469 		 * Note that here we relied on an NPTL glibc impl detail to
470 		 * always read the lock type at the entry of the lock op
471 		 * (pthread_mutex_t.__data.__type, offset 0x10) before
472 		 * doing any locking operations to guarantee that.  It's
473 		 * actually not good to rely on this impl detail because
474 		 * logically a pthread-compatible lib can implement the
475 		 * locks without types and we can fail when linking with
476 		 * them.  However since we used to find bugs with this
477 		 * strict check we still keep it around.  Hopefully this
478 		 * could be a good hint when it fails again.  If one day
479 		 * it'll break on some other impl of glibc we'll revisit.
480 		 */
481 		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
482 			err("unexpected write fault");
483 
484 		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
485 		offset &= ~(page_size-1);
486 
487 		if (copy_page(uffd, offset, args->apply_wp))
488 			args->missing_faults++;
489 	}
490 }
491 
492 void *uffd_poll_thread(void *arg)
493 {
494 	struct uffd_args *args = (struct uffd_args *)arg;
495 	unsigned long cpu = args->cpu;
496 	struct pollfd pollfd[2];
497 	struct uffd_msg msg;
498 	struct uffdio_register uffd_reg;
499 	int ret;
500 	char tmp_chr;
501 
502 	pollfd[0].fd = uffd;
503 	pollfd[0].events = POLLIN;
504 	pollfd[1].fd = pipefd[cpu*2];
505 	pollfd[1].events = POLLIN;
506 
507 	for (;;) {
508 		ret = poll(pollfd, 2, -1);
509 		if (ret <= 0) {
510 			if (errno == EINTR || errno == EAGAIN)
511 				continue;
512 			err("poll error: %d", ret);
513 		}
514 		if (pollfd[1].revents) {
515 			if (!(pollfd[1].revents & POLLIN))
516 				err("pollfd[1].revents %d", pollfd[1].revents);
517 			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
518 				err("read pipefd error");
519 			break;
520 		}
521 		if (!(pollfd[0].revents & POLLIN))
522 			err("pollfd[0].revents %d", pollfd[0].revents);
523 		if (uffd_read_msg(uffd, &msg))
524 			continue;
525 		switch (msg.event) {
526 		default:
527 			err("unexpected msg event %u\n", msg.event);
528 			break;
529 		case UFFD_EVENT_PAGEFAULT:
530 			uffd_handle_page_fault(&msg, args);
531 			break;
532 		case UFFD_EVENT_FORK:
533 			close(uffd);
534 			uffd = msg.arg.fork.ufd;
535 			pollfd[0].fd = uffd;
536 			break;
537 		case UFFD_EVENT_REMOVE:
538 			uffd_reg.range.start = msg.arg.remove.start;
539 			uffd_reg.range.len = msg.arg.remove.end -
540 				msg.arg.remove.start;
541 			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
542 				err("remove failure");
543 			break;
544 		case UFFD_EVENT_REMAP:
545 			area_remap = area_dst;  /* save for later unmap */
546 			area_dst = (char *)(unsigned long)msg.arg.remap.to;
547 			break;
548 		}
549 	}
550 
551 	return NULL;
552 }
553 
554 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
555 			    unsigned long offset)
556 {
557 	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
558 				     uffdio_copy->len,
559 				     offset);
560 	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
561 		/* real retval in ufdio_copy.copy */
562 		if (uffdio_copy->copy != -EEXIST)
563 			err("UFFDIO_COPY retry error: %"PRId64,
564 			    (int64_t)uffdio_copy->copy);
565 	} else {
566 		err("UFFDIO_COPY retry unexpected: %"PRId64,
567 		    (int64_t)uffdio_copy->copy);
568 	}
569 }
570 
571 static void wake_range(int ufd, unsigned long addr, unsigned long len)
572 {
573 	struct uffdio_range uffdio_wake;
574 
575 	uffdio_wake.start = addr;
576 	uffdio_wake.len = len;
577 
578 	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
579 		fprintf(stderr, "error waking %lu\n",
580 			addr), exit(1);
581 }
582 
583 int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
584 {
585 	struct uffdio_copy uffdio_copy;
586 
587 	if (offset >= nr_pages * page_size)
588 		err("unexpected offset %lu\n", offset);
589 	uffdio_copy.dst = (unsigned long) area_dst + offset;
590 	uffdio_copy.src = (unsigned long) area_src + offset;
591 	uffdio_copy.len = page_size;
592 	if (wp)
593 		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
594 	else
595 		uffdio_copy.mode = 0;
596 	uffdio_copy.copy = 0;
597 	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
598 		/* real retval in ufdio_copy.copy */
599 		if (uffdio_copy.copy != -EEXIST)
600 			err("UFFDIO_COPY error: %"PRId64,
601 			    (int64_t)uffdio_copy.copy);
602 		wake_range(ufd, uffdio_copy.dst, page_size);
603 	} else if (uffdio_copy.copy != page_size) {
604 		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
605 	} else {
606 		if (test_uffdio_copy_eexist && retry) {
607 			test_uffdio_copy_eexist = false;
608 			retry_copy_page(ufd, &uffdio_copy, offset);
609 		}
610 		return 1;
611 	}
612 	return 0;
613 }
614 
/*
 * Copy one page at @offset from area_src to area_dst without the -EEXIST
 * retry.  See __copy_page() for the return value.
 */
int copy_page(int ufd, unsigned long offset, bool wp)
{
	const bool retry = false;

	return __copy_page(ufd, offset, retry, wp);
}
619