xref: /openbmc/linux/tools/testing/selftests/mm/uffd-common.c (revision 5f8b7d4b2e9604d03ae06f1a2dd5a1f34c33e533)
133be4e89SPeter Xu // SPDX-License-Identifier: GPL-2.0-only
233be4e89SPeter Xu /*
333be4e89SPeter Xu  * Userfaultfd tests util functions
433be4e89SPeter Xu  *
533be4e89SPeter Xu  * Copyright (C) 2015-2023  Red Hat, Inc.
633be4e89SPeter Xu  */
733be4e89SPeter Xu 
833be4e89SPeter Xu #include "uffd-common.h"
933be4e89SPeter Xu 
1033be4e89SPeter Xu #define BASE_PMD_ADDR ((void *)(1UL << 30))
1133be4e89SPeter Xu 
1233be4e89SPeter Xu volatile bool test_uffdio_copy_eexist = true;
13265818efSPeter Xu unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
1433be4e89SPeter Xu char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
15c5cb9036SPeter Xu int uffd = -1, uffd_flags, finished, *pipefd, test_type;
16111fd29bSPeter Xu bool map_shared;
1762515b5fSPeter Xu bool test_uffdio_wp = true;
1833be4e89SPeter Xu unsigned long long *count_verify;
1933be4e89SPeter Xu uffd_test_ops_t *uffd_test_ops;
20*0b9be246SEdward Liaw atomic_bool ready_for_fork;
2133be4e89SPeter Xu 
uffd_mem_fd_create(off_t mem_size,bool hugetlb)22c5cb9036SPeter Xu static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
23c5cb9036SPeter Xu {
24c5cb9036SPeter Xu 	unsigned int memfd_flags = 0;
25c5cb9036SPeter Xu 	int mem_fd;
26c5cb9036SPeter Xu 
27c5cb9036SPeter Xu 	if (hugetlb)
28c5cb9036SPeter Xu 		memfd_flags = MFD_HUGETLB;
29c5cb9036SPeter Xu 	mem_fd = memfd_create("uffd-test", memfd_flags);
30c5cb9036SPeter Xu 	if (mem_fd < 0)
31c5cb9036SPeter Xu 		err("memfd_create");
32c5cb9036SPeter Xu 	if (ftruncate(mem_fd, mem_size))
33c5cb9036SPeter Xu 		err("ftruncate");
34c5cb9036SPeter Xu 	if (fallocate(mem_fd,
35c5cb9036SPeter Xu 		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
36c5cb9036SPeter Xu 		      mem_size))
37c5cb9036SPeter Xu 		err("fallocate");
38c5cb9036SPeter Xu 
39c5cb9036SPeter Xu 	return mem_fd;
40c5cb9036SPeter Xu }
41c5cb9036SPeter Xu 
anon_release_pages(char * rel_area)4233be4e89SPeter Xu static void anon_release_pages(char *rel_area)
4333be4e89SPeter Xu {
4433be4e89SPeter Xu 	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
4533be4e89SPeter Xu 		err("madvise(MADV_DONTNEED) failed");
4633be4e89SPeter Xu }
4733be4e89SPeter Xu 
anon_allocate_area(void ** alloc_area,bool is_src)48be39fec4SPeter Xu static int anon_allocate_area(void **alloc_area, bool is_src)
4933be4e89SPeter Xu {
5033be4e89SPeter Xu 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
5133be4e89SPeter Xu 			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
52be39fec4SPeter Xu 	if (*alloc_area == MAP_FAILED) {
53be39fec4SPeter Xu 		*alloc_area = NULL;
54be39fec4SPeter Xu 		return -errno;
55be39fec4SPeter Xu 	}
56be39fec4SPeter Xu 	return 0;
5733be4e89SPeter Xu }
5833be4e89SPeter Xu 
/*
 * alias_mapping callback for memory types that have no alias mapping:
 * intentionally leaves *start untouched.
 */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	/* Nothing to redirect; all arguments are deliberately ignored. */
	(void)start;
	(void)len;
	(void)offset;
}
6233be4e89SPeter Xu 
hugetlb_release_pages(char * rel_area)6333be4e89SPeter Xu static void hugetlb_release_pages(char *rel_area)
6433be4e89SPeter Xu {
6533be4e89SPeter Xu 	if (!map_shared) {
6633be4e89SPeter Xu 		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
6733be4e89SPeter Xu 			err("madvise(MADV_DONTNEED) failed");
6833be4e89SPeter Xu 	} else {
6933be4e89SPeter Xu 		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
7033be4e89SPeter Xu 			err("madvise(MADV_REMOVE) failed");
7133be4e89SPeter Xu 	}
7233be4e89SPeter Xu }
7333be4e89SPeter Xu 
hugetlb_allocate_area(void ** alloc_area,bool is_src)74be39fec4SPeter Xu static int hugetlb_allocate_area(void **alloc_area, bool is_src)
7533be4e89SPeter Xu {
7633be4e89SPeter Xu 	off_t size = nr_pages * page_size;
7733be4e89SPeter Xu 	off_t offset = is_src ? 0 : size;
7833be4e89SPeter Xu 	void *area_alias = NULL;
7933be4e89SPeter Xu 	char **alloc_area_alias;
80c5cb9036SPeter Xu 	int mem_fd = uffd_mem_fd_create(size * 2, true);
8133be4e89SPeter Xu 
8233be4e89SPeter Xu 	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
8333be4e89SPeter Xu 			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
8433be4e89SPeter Xu 			   (is_src ? 0 : MAP_NORESERVE),
8533be4e89SPeter Xu 			   mem_fd, offset);
86be39fec4SPeter Xu 	if (*alloc_area == MAP_FAILED) {
87be39fec4SPeter Xu 		*alloc_area = NULL;
88be39fec4SPeter Xu 		return -errno;
89be39fec4SPeter Xu 	}
9033be4e89SPeter Xu 
9133be4e89SPeter Xu 	if (map_shared) {
9233be4e89SPeter Xu 		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
9333be4e89SPeter Xu 				  MAP_SHARED, mem_fd, offset);
9433be4e89SPeter Xu 		if (area_alias == MAP_FAILED)
95be39fec4SPeter Xu 			return -errno;
9633be4e89SPeter Xu 	}
9733be4e89SPeter Xu 
9833be4e89SPeter Xu 	if (is_src) {
9933be4e89SPeter Xu 		alloc_area_alias = &area_src_alias;
10033be4e89SPeter Xu 	} else {
10133be4e89SPeter Xu 		alloc_area_alias = &area_dst_alias;
10233be4e89SPeter Xu 	}
10333be4e89SPeter Xu 	if (area_alias)
10433be4e89SPeter Xu 		*alloc_area_alias = area_alias;
105c5cb9036SPeter Xu 
106c5cb9036SPeter Xu 	close(mem_fd);
107be39fec4SPeter Xu 	return 0;
10833be4e89SPeter Xu }
10933be4e89SPeter Xu 
hugetlb_alias_mapping(__u64 * start,size_t len,unsigned long offset)11033be4e89SPeter Xu static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
11133be4e89SPeter Xu {
11233be4e89SPeter Xu 	if (!map_shared)
11333be4e89SPeter Xu 		return;
11433be4e89SPeter Xu 
11533be4e89SPeter Xu 	*start = (unsigned long) area_dst_alias + offset;
11633be4e89SPeter Xu }
11733be4e89SPeter Xu 
shmem_release_pages(char * rel_area)11833be4e89SPeter Xu static void shmem_release_pages(char *rel_area)
11933be4e89SPeter Xu {
12033be4e89SPeter Xu 	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
12133be4e89SPeter Xu 		err("madvise(MADV_REMOVE) failed");
12233be4e89SPeter Xu }
12333be4e89SPeter Xu 
shmem_allocate_area(void ** alloc_area,bool is_src)124be39fec4SPeter Xu static int shmem_allocate_area(void **alloc_area, bool is_src)
12533be4e89SPeter Xu {
12633be4e89SPeter Xu 	void *area_alias = NULL;
127265818efSPeter Xu 	size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
12833be4e89SPeter Xu 	unsigned long offset = is_src ? 0 : bytes;
12933be4e89SPeter Xu 	char *p = NULL, *p_alias = NULL;
130c5cb9036SPeter Xu 	int mem_fd = uffd_mem_fd_create(bytes * 2, false);
13133be4e89SPeter Xu 
13262515b5fSPeter Xu 	/* TODO: clean this up.  Use a static addr is ugly */
13333be4e89SPeter Xu 	p = BASE_PMD_ADDR;
13433be4e89SPeter Xu 	if (!is_src)
13533be4e89SPeter Xu 		/* src map + alias + interleaved hpages */
13633be4e89SPeter Xu 		p += 2 * (bytes + hpage_size);
13733be4e89SPeter Xu 	p_alias = p;
13833be4e89SPeter Xu 	p_alias += bytes;
13933be4e89SPeter Xu 	p_alias += hpage_size;  /* Prevent src/dst VMA merge */
14033be4e89SPeter Xu 
14133be4e89SPeter Xu 	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
14233be4e89SPeter Xu 			   mem_fd, offset);
143be39fec4SPeter Xu 	if (*alloc_area == MAP_FAILED) {
144be39fec4SPeter Xu 		*alloc_area = NULL;
145be39fec4SPeter Xu 		return -errno;
146be39fec4SPeter Xu 	}
14762515b5fSPeter Xu 	if (*alloc_area != p)
14833be4e89SPeter Xu 		err("mmap of memfd failed at %p", p);
14933be4e89SPeter Xu 
15033be4e89SPeter Xu 	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
15133be4e89SPeter Xu 			  mem_fd, offset);
152be39fec4SPeter Xu 	if (area_alias == MAP_FAILED) {
153be39fec4SPeter Xu 		munmap(*alloc_area, bytes);
154be39fec4SPeter Xu 		*alloc_area = NULL;
155be39fec4SPeter Xu 		return -errno;
156be39fec4SPeter Xu 	}
15762515b5fSPeter Xu 	if (area_alias != p_alias)
15833be4e89SPeter Xu 		err("mmap of anonymous memory failed at %p", p_alias);
15933be4e89SPeter Xu 
16033be4e89SPeter Xu 	if (is_src)
16133be4e89SPeter Xu 		area_src_alias = area_alias;
16233be4e89SPeter Xu 	else
16333be4e89SPeter Xu 		area_dst_alias = area_alias;
164c5cb9036SPeter Xu 
165c5cb9036SPeter Xu 	close(mem_fd);
166be39fec4SPeter Xu 	return 0;
16733be4e89SPeter Xu }
16833be4e89SPeter Xu 
shmem_alias_mapping(__u64 * start,size_t len,unsigned long offset)16933be4e89SPeter Xu static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
17033be4e89SPeter Xu {
17133be4e89SPeter Xu 	*start = (unsigned long)area_dst_alias + offset;
17233be4e89SPeter Xu }
17333be4e89SPeter Xu 
shmem_check_pmd_mapping(void * p,int expect_nr_hpages)17433be4e89SPeter Xu static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
17533be4e89SPeter Xu {
176265818efSPeter Xu 	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
177265818efSPeter Xu 			      read_pmd_pagesize()))
17833be4e89SPeter Xu 		err("Did not find expected %d number of hugepages",
17933be4e89SPeter Xu 		    expect_nr_hpages);
18033be4e89SPeter Xu }
18133be4e89SPeter Xu 
18233be4e89SPeter Xu struct uffd_test_ops anon_uffd_test_ops = {
18333be4e89SPeter Xu 	.allocate_area = anon_allocate_area,
18433be4e89SPeter Xu 	.release_pages = anon_release_pages,
18533be4e89SPeter Xu 	.alias_mapping = noop_alias_mapping,
18633be4e89SPeter Xu 	.check_pmd_mapping = NULL,
18733be4e89SPeter Xu };
18833be4e89SPeter Xu 
18933be4e89SPeter Xu struct uffd_test_ops shmem_uffd_test_ops = {
19033be4e89SPeter Xu 	.allocate_area = shmem_allocate_area,
19133be4e89SPeter Xu 	.release_pages = shmem_release_pages,
19233be4e89SPeter Xu 	.alias_mapping = shmem_alias_mapping,
19333be4e89SPeter Xu 	.check_pmd_mapping = shmem_check_pmd_mapping,
19433be4e89SPeter Xu };
19533be4e89SPeter Xu 
19633be4e89SPeter Xu struct uffd_test_ops hugetlb_uffd_test_ops = {
19733be4e89SPeter Xu 	.allocate_area = hugetlb_allocate_area,
19833be4e89SPeter Xu 	.release_pages = hugetlb_release_pages,
19933be4e89SPeter Xu 	.alias_mapping = hugetlb_alias_mapping,
20033be4e89SPeter Xu 	.check_pmd_mapping = NULL,
20133be4e89SPeter Xu };
20233be4e89SPeter Xu 
uffd_stats_report(struct uffd_args * args,int n_cpus)20350834084SPeter Xu void uffd_stats_report(struct uffd_args *args, int n_cpus)
20433be4e89SPeter Xu {
20533be4e89SPeter Xu 	int i;
20633be4e89SPeter Xu 	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
20733be4e89SPeter Xu 
20833be4e89SPeter Xu 	for (i = 0; i < n_cpus; i++) {
20950834084SPeter Xu 		miss_total += args[i].missing_faults;
21050834084SPeter Xu 		wp_total += args[i].wp_faults;
21150834084SPeter Xu 		minor_total += args[i].minor_faults;
21233be4e89SPeter Xu 	}
21333be4e89SPeter Xu 
21433be4e89SPeter Xu 	printf("userfaults: ");
21533be4e89SPeter Xu 	if (miss_total) {
21633be4e89SPeter Xu 		printf("%llu missing (", miss_total);
21733be4e89SPeter Xu 		for (i = 0; i < n_cpus; i++)
21850834084SPeter Xu 			printf("%lu+", args[i].missing_faults);
21933be4e89SPeter Xu 		printf("\b) ");
22033be4e89SPeter Xu 	}
22133be4e89SPeter Xu 	if (wp_total) {
22233be4e89SPeter Xu 		printf("%llu wp (", wp_total);
22333be4e89SPeter Xu 		for (i = 0; i < n_cpus; i++)
22450834084SPeter Xu 			printf("%lu+", args[i].wp_faults);
22533be4e89SPeter Xu 		printf("\b) ");
22633be4e89SPeter Xu 	}
22733be4e89SPeter Xu 	if (minor_total) {
22833be4e89SPeter Xu 		printf("%llu minor (", minor_total);
22933be4e89SPeter Xu 		for (i = 0; i < n_cpus; i++)
23050834084SPeter Xu 			printf("%lu+", args[i].minor_faults);
23133be4e89SPeter Xu 		printf("\b)");
23233be4e89SPeter Xu 	}
23333be4e89SPeter Xu 	printf("\n");
23433be4e89SPeter Xu }
23533be4e89SPeter Xu 
userfaultfd_open(uint64_t * features)236f9da2426SPeter Xu int userfaultfd_open(uint64_t *features)
23733be4e89SPeter Xu {
23833be4e89SPeter Xu 	struct uffdio_api uffdio_api;
23933be4e89SPeter Xu 
240111fd29bSPeter Xu 	uffd = uffd_open(UFFD_FLAGS);
24133be4e89SPeter Xu 	if (uffd < 0)
242f9da2426SPeter Xu 		return -1;
24333be4e89SPeter Xu 	uffd_flags = fcntl(uffd, F_GETFD, NULL);
24433be4e89SPeter Xu 
24533be4e89SPeter Xu 	uffdio_api.api = UFFD_API;
24633be4e89SPeter Xu 	uffdio_api.features = *features;
24733be4e89SPeter Xu 	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
248f9da2426SPeter Xu 		/* Probably lack of CAP_PTRACE? */
249f9da2426SPeter Xu 		return -1;
25033be4e89SPeter Xu 	if (uffdio_api.api != UFFD_API)
25133be4e89SPeter Xu 		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
25233be4e89SPeter Xu 
25333be4e89SPeter Xu 	*features = uffdio_api.features;
254f9da2426SPeter Xu 	return 0;
25533be4e89SPeter Xu }
25633be4e89SPeter Xu 
munmap_area(void ** area)25733be4e89SPeter Xu static inline void munmap_area(void **area)
25833be4e89SPeter Xu {
25933be4e89SPeter Xu 	if (*area)
26033be4e89SPeter Xu 		if (munmap(*area, nr_pages * page_size))
26133be4e89SPeter Xu 			err("munmap");
26233be4e89SPeter Xu 
26333be4e89SPeter Xu 	*area = NULL;
26433be4e89SPeter Xu }
26533be4e89SPeter Xu 
uffd_test_ctx_clear(void)26633be4e89SPeter Xu static void uffd_test_ctx_clear(void)
26733be4e89SPeter Xu {
26833be4e89SPeter Xu 	size_t i;
26933be4e89SPeter Xu 
27033be4e89SPeter Xu 	if (pipefd) {
27133be4e89SPeter Xu 		for (i = 0; i < nr_cpus * 2; ++i) {
27233be4e89SPeter Xu 			if (close(pipefd[i]))
27333be4e89SPeter Xu 				err("close pipefd");
27433be4e89SPeter Xu 		}
27533be4e89SPeter Xu 		free(pipefd);
27633be4e89SPeter Xu 		pipefd = NULL;
27733be4e89SPeter Xu 	}
27833be4e89SPeter Xu 
27933be4e89SPeter Xu 	if (count_verify) {
28033be4e89SPeter Xu 		free(count_verify);
28133be4e89SPeter Xu 		count_verify = NULL;
28233be4e89SPeter Xu 	}
28333be4e89SPeter Xu 
28433be4e89SPeter Xu 	if (uffd != -1) {
28533be4e89SPeter Xu 		if (close(uffd))
28633be4e89SPeter Xu 			err("close uffd");
28733be4e89SPeter Xu 		uffd = -1;
28833be4e89SPeter Xu 	}
28933be4e89SPeter Xu 
29033be4e89SPeter Xu 	munmap_area((void **)&area_src);
29133be4e89SPeter Xu 	munmap_area((void **)&area_src_alias);
29233be4e89SPeter Xu 	munmap_area((void **)&area_dst);
29333be4e89SPeter Xu 	munmap_area((void **)&area_dst_alias);
29433be4e89SPeter Xu 	munmap_area((void **)&area_remap);
29533be4e89SPeter Xu }
29633be4e89SPeter Xu 
uffd_test_ctx_init(uint64_t features,const char ** errmsg)297f9da2426SPeter Xu int uffd_test_ctx_init(uint64_t features, const char **errmsg)
29833be4e89SPeter Xu {
29933be4e89SPeter Xu 	unsigned long nr, cpu;
300be39fec4SPeter Xu 	int ret;
30133be4e89SPeter Xu 
30233be4e89SPeter Xu 	uffd_test_ctx_clear();
30333be4e89SPeter Xu 
304be39fec4SPeter Xu 	ret = uffd_test_ops->allocate_area((void **)&area_src, true);
305f9da2426SPeter Xu 	ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
306f9da2426SPeter Xu 	if (ret) {
307f9da2426SPeter Xu 		if (errmsg)
308f9da2426SPeter Xu 			*errmsg = "memory allocation failed";
309be39fec4SPeter Xu 		return ret;
310f9da2426SPeter Xu 	}
31133be4e89SPeter Xu 
312f9da2426SPeter Xu 	ret = userfaultfd_open(&features);
313f9da2426SPeter Xu 	if (ret) {
314f9da2426SPeter Xu 		if (errmsg)
315f9da2426SPeter Xu 			*errmsg = "possible lack of priviledge";
316f9da2426SPeter Xu 		return ret;
317f9da2426SPeter Xu 	}
31833be4e89SPeter Xu 
31933be4e89SPeter Xu 	count_verify = malloc(nr_pages * sizeof(unsigned long long));
32033be4e89SPeter Xu 	if (!count_verify)
32133be4e89SPeter Xu 		err("count_verify");
32233be4e89SPeter Xu 
32333be4e89SPeter Xu 	for (nr = 0; nr < nr_pages; nr++) {
32433be4e89SPeter Xu 		*area_mutex(area_src, nr) =
32533be4e89SPeter Xu 			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
32633be4e89SPeter Xu 		count_verify[nr] = *area_count(area_src, nr) = 1;
32733be4e89SPeter Xu 		/*
32833be4e89SPeter Xu 		 * In the transition between 255 to 256, powerpc will
32933be4e89SPeter Xu 		 * read out of order in my_bcmp and see both bytes as
33033be4e89SPeter Xu 		 * zero, so leave a placeholder below always non-zero
33133be4e89SPeter Xu 		 * after the count, to avoid my_bcmp to trigger false
33233be4e89SPeter Xu 		 * positives.
33333be4e89SPeter Xu 		 */
33433be4e89SPeter Xu 		*(area_count(area_src, nr) + 1) = 1;
33533be4e89SPeter Xu 	}
33633be4e89SPeter Xu 
33733be4e89SPeter Xu 	/*
33833be4e89SPeter Xu 	 * After initialization of area_src, we must explicitly release pages
33933be4e89SPeter Xu 	 * for area_dst to make sure it's fully empty.  Otherwise we could have
34033be4e89SPeter Xu 	 * some area_dst pages be errornously initialized with zero pages,
34133be4e89SPeter Xu 	 * hence we could hit memory corruption later in the test.
34233be4e89SPeter Xu 	 *
34333be4e89SPeter Xu 	 * One example is when THP is globally enabled, above allocate_area()
34433be4e89SPeter Xu 	 * calls could have the two areas merged into a single VMA (as they
34533be4e89SPeter Xu 	 * will have the same VMA flags so they're mergeable).  When we
34633be4e89SPeter Xu 	 * initialize the area_src above, it's possible that some part of
34733be4e89SPeter Xu 	 * area_dst could have been faulted in via one huge THP that will be
34833be4e89SPeter Xu 	 * shared between area_src and area_dst.  It could cause some of the
34933be4e89SPeter Xu 	 * area_dst won't be trapped by missing userfaults.
35033be4e89SPeter Xu 	 *
35133be4e89SPeter Xu 	 * This release_pages() will guarantee even if that happened, we'll
35233be4e89SPeter Xu 	 * proactively split the thp and drop any accidentally initialized
35333be4e89SPeter Xu 	 * pages within area_dst.
35433be4e89SPeter Xu 	 */
35533be4e89SPeter Xu 	uffd_test_ops->release_pages(area_dst);
35633be4e89SPeter Xu 
35733be4e89SPeter Xu 	pipefd = malloc(sizeof(int) * nr_cpus * 2);
35833be4e89SPeter Xu 	if (!pipefd)
35933be4e89SPeter Xu 		err("pipefd");
36033be4e89SPeter Xu 	for (cpu = 0; cpu < nr_cpus; cpu++)
36133be4e89SPeter Xu 		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
36233be4e89SPeter Xu 			err("pipe");
363be39fec4SPeter Xu 
364be39fec4SPeter Xu 	return 0;
36533be4e89SPeter Xu }
36633be4e89SPeter Xu 
/*
 * Set or clear userfaultfd write protection on a range.
 *
 * @ufd:   userfaultfd descriptor the ioctl is issued on.
 * @start: start address of the range.
 * @len:   length of the range in bytes.
 * @wp:    true requests UFFDIO_WRITEPROTECT_MODE_WP; false passes mode 0.
 *
 * A failing ioctl is reported via err().
 */
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect req;

	req.range.start = start;
	req.range.len = len;
	if (wp)
		req.mode = UFFDIO_WRITEPROTECT_MODE_WP;
	else
		req.mode = 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &req) != 0)
		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
}
38033be4e89SPeter Xu 
/*
 * Resolve a minor fault on a range with UFFDIO_CONTINUE, then repeat the
 * same request to exercise the kernel's error path.
 *
 * @ufd:   userfaultfd descriptor the ioctls are issued on.
 * @start: start address of the range.
 * @len:   length of the range in bytes.
 * @wp:    when true, UFFDIO_CONTINUE_MODE_WP is set in the request mode.
 *
 * The first ioctl must succeed.  The second one must fail and leave
 * -EEXIST in req.mapped; anything else is reported via err().
 */
static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_continue req;
	int rc;

	req.range.start = start;
	req.range.len = len;
	req.mode = wp ? UFFDIO_CONTINUE_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req) != 0)
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Continue's error handling in the kernel differs subtly from copy
	 * and zeropage, so deliberately provoke -EEXIST by repeating the
	 * request and make sure that does not trigger a BUG.
	 */
	req.mapped = 0;
	rc = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (rc < 0 && req.mapped == -EEXIST)
		return;

	err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
	    rc, (int64_t) req.mapped);
}
40733be4e89SPeter Xu 
uffd_read_msg(int ufd,struct uffd_msg * msg)40833be4e89SPeter Xu int uffd_read_msg(int ufd, struct uffd_msg *msg)
40933be4e89SPeter Xu {
41033be4e89SPeter Xu 	int ret = read(uffd, msg, sizeof(*msg));
41133be4e89SPeter Xu 
41233be4e89SPeter Xu 	if (ret != sizeof(*msg)) {
41333be4e89SPeter Xu 		if (ret < 0) {
41433be4e89SPeter Xu 			if (errno == EAGAIN || errno == EINTR)
41533be4e89SPeter Xu 				return 1;
41633be4e89SPeter Xu 			err("blocking read error");
41733be4e89SPeter Xu 		} else {
41833be4e89SPeter Xu 			err("short read");
41933be4e89SPeter Xu 		}
42033be4e89SPeter Xu 	}
42133be4e89SPeter Xu 
42233be4e89SPeter Xu 	return 0;
42333be4e89SPeter Xu }
42433be4e89SPeter Xu 
uffd_handle_page_fault(struct uffd_msg * msg,struct uffd_args * args)42550834084SPeter Xu void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
42633be4e89SPeter Xu {
42733be4e89SPeter Xu 	unsigned long offset;
42833be4e89SPeter Xu 
42933be4e89SPeter Xu 	if (msg->event != UFFD_EVENT_PAGEFAULT)
43033be4e89SPeter Xu 		err("unexpected msg event %u", msg->event);
43133be4e89SPeter Xu 
43233be4e89SPeter Xu 	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
43333be4e89SPeter Xu 		/* Write protect page faults */
43433be4e89SPeter Xu 		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
43550834084SPeter Xu 		args->wp_faults++;
43633be4e89SPeter Xu 	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
43733be4e89SPeter Xu 		uint8_t *area;
43833be4e89SPeter Xu 		int b;
43933be4e89SPeter Xu 
44033be4e89SPeter Xu 		/*
44133be4e89SPeter Xu 		 * Minor page faults
44233be4e89SPeter Xu 		 *
44333be4e89SPeter Xu 		 * To prove we can modify the original range for testing
44433be4e89SPeter Xu 		 * purposes, we're going to bit flip this range before
44533be4e89SPeter Xu 		 * continuing.
44633be4e89SPeter Xu 		 *
44733be4e89SPeter Xu 		 * Note that this requires all minor page fault tests operate on
44833be4e89SPeter Xu 		 * area_dst (non-UFFD-registered) and area_dst_alias
44933be4e89SPeter Xu 		 * (UFFD-registered).
45033be4e89SPeter Xu 		 */
45133be4e89SPeter Xu 
45233be4e89SPeter Xu 		area = (uint8_t *)(area_dst +
45333be4e89SPeter Xu 				   ((char *)msg->arg.pagefault.address -
45433be4e89SPeter Xu 				    area_dst_alias));
45533be4e89SPeter Xu 		for (b = 0; b < page_size; ++b)
45633be4e89SPeter Xu 			area[b] = ~area[b];
4570210c43eSPeter Xu 		continue_range(uffd, msg->arg.pagefault.address, page_size,
4580210c43eSPeter Xu 			       args->apply_wp);
45950834084SPeter Xu 		args->minor_faults++;
46033be4e89SPeter Xu 	} else {
46133be4e89SPeter Xu 		/*
46233be4e89SPeter Xu 		 * Missing page faults.
46333be4e89SPeter Xu 		 *
46433be4e89SPeter Xu 		 * Here we force a write check for each of the missing mode
46533be4e89SPeter Xu 		 * faults.  It's guaranteed because the only threads that
46633be4e89SPeter Xu 		 * will trigger uffd faults are the locking threads, and
46733be4e89SPeter Xu 		 * their first instruction to touch the missing page will
46833be4e89SPeter Xu 		 * always be pthread_mutex_lock().
46933be4e89SPeter Xu 		 *
47033be4e89SPeter Xu 		 * Note that here we relied on an NPTL glibc impl detail to
47133be4e89SPeter Xu 		 * always read the lock type at the entry of the lock op
47233be4e89SPeter Xu 		 * (pthread_mutex_t.__data.__type, offset 0x10) before
47333be4e89SPeter Xu 		 * doing any locking operations to guarantee that.  It's
47433be4e89SPeter Xu 		 * actually not good to rely on this impl detail because
47533be4e89SPeter Xu 		 * logically a pthread-compatible lib can implement the
47633be4e89SPeter Xu 		 * locks without types and we can fail when linking with
47733be4e89SPeter Xu 		 * them.  However since we used to find bugs with this
47833be4e89SPeter Xu 		 * strict check we still keep it around.  Hopefully this
47933be4e89SPeter Xu 		 * could be a good hint when it fails again.  If one day
48033be4e89SPeter Xu 		 * it'll break on some other impl of glibc we'll revisit.
48133be4e89SPeter Xu 		 */
48233be4e89SPeter Xu 		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
48333be4e89SPeter Xu 			err("unexpected write fault");
48433be4e89SPeter Xu 
48533be4e89SPeter Xu 		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
48633be4e89SPeter Xu 		offset &= ~(page_size-1);
48733be4e89SPeter Xu 
4880210c43eSPeter Xu 		if (copy_page(uffd, offset, args->apply_wp))
48950834084SPeter Xu 			args->missing_faults++;
49033be4e89SPeter Xu 	}
49133be4e89SPeter Xu }
49233be4e89SPeter Xu 
uffd_poll_thread(void * arg)49333be4e89SPeter Xu void *uffd_poll_thread(void *arg)
49433be4e89SPeter Xu {
49550834084SPeter Xu 	struct uffd_args *args = (struct uffd_args *)arg;
49650834084SPeter Xu 	unsigned long cpu = args->cpu;
49733be4e89SPeter Xu 	struct pollfd pollfd[2];
49833be4e89SPeter Xu 	struct uffd_msg msg;
49933be4e89SPeter Xu 	struct uffdio_register uffd_reg;
50033be4e89SPeter Xu 	int ret;
50133be4e89SPeter Xu 	char tmp_chr;
50233be4e89SPeter Xu 
5037cf0f9e8SAxel Rasmussen 	if (!args->handle_fault)
5047cf0f9e8SAxel Rasmussen 		args->handle_fault = uffd_handle_page_fault;
5057cf0f9e8SAxel Rasmussen 
50633be4e89SPeter Xu 	pollfd[0].fd = uffd;
50733be4e89SPeter Xu 	pollfd[0].events = POLLIN;
50833be4e89SPeter Xu 	pollfd[1].fd = pipefd[cpu*2];
50933be4e89SPeter Xu 	pollfd[1].events = POLLIN;
51033be4e89SPeter Xu 
511*0b9be246SEdward Liaw 	ready_for_fork = true;
51207cf57ebSEdward Liaw 
51333be4e89SPeter Xu 	for (;;) {
51433be4e89SPeter Xu 		ret = poll(pollfd, 2, -1);
51533be4e89SPeter Xu 		if (ret <= 0) {
51633be4e89SPeter Xu 			if (errno == EINTR || errno == EAGAIN)
51733be4e89SPeter Xu 				continue;
51833be4e89SPeter Xu 			err("poll error: %d", ret);
51933be4e89SPeter Xu 		}
52033be4e89SPeter Xu 		if (pollfd[1].revents) {
52133be4e89SPeter Xu 			if (!(pollfd[1].revents & POLLIN))
52233be4e89SPeter Xu 				err("pollfd[1].revents %d", pollfd[1].revents);
52333be4e89SPeter Xu 			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
52433be4e89SPeter Xu 				err("read pipefd error");
52533be4e89SPeter Xu 			break;
52633be4e89SPeter Xu 		}
52733be4e89SPeter Xu 		if (!(pollfd[0].revents & POLLIN))
52833be4e89SPeter Xu 			err("pollfd[0].revents %d", pollfd[0].revents);
52933be4e89SPeter Xu 		if (uffd_read_msg(uffd, &msg))
53033be4e89SPeter Xu 			continue;
53133be4e89SPeter Xu 		switch (msg.event) {
53233be4e89SPeter Xu 		default:
53333be4e89SPeter Xu 			err("unexpected msg event %u\n", msg.event);
53433be4e89SPeter Xu 			break;
53533be4e89SPeter Xu 		case UFFD_EVENT_PAGEFAULT:
5367cf0f9e8SAxel Rasmussen 			args->handle_fault(&msg, args);
53733be4e89SPeter Xu 			break;
53833be4e89SPeter Xu 		case UFFD_EVENT_FORK:
53933be4e89SPeter Xu 			close(uffd);
54033be4e89SPeter Xu 			uffd = msg.arg.fork.ufd;
54133be4e89SPeter Xu 			pollfd[0].fd = uffd;
54233be4e89SPeter Xu 			break;
54333be4e89SPeter Xu 		case UFFD_EVENT_REMOVE:
54433be4e89SPeter Xu 			uffd_reg.range.start = msg.arg.remove.start;
54533be4e89SPeter Xu 			uffd_reg.range.len = msg.arg.remove.end -
54633be4e89SPeter Xu 				msg.arg.remove.start;
54733be4e89SPeter Xu 			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
54833be4e89SPeter Xu 				err("remove failure");
54933be4e89SPeter Xu 			break;
55033be4e89SPeter Xu 		case UFFD_EVENT_REMAP:
55133be4e89SPeter Xu 			area_remap = area_dst;  /* save for later unmap */
55233be4e89SPeter Xu 			area_dst = (char *)(unsigned long)msg.arg.remap.to;
55333be4e89SPeter Xu 			break;
55433be4e89SPeter Xu 		}
55533be4e89SPeter Xu 	}
55633be4e89SPeter Xu 
55733be4e89SPeter Xu 	return NULL;
55833be4e89SPeter Xu }
55933be4e89SPeter Xu 
retry_copy_page(int ufd,struct uffdio_copy * uffdio_copy,unsigned long offset)56033be4e89SPeter Xu static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
56133be4e89SPeter Xu 			    unsigned long offset)
56233be4e89SPeter Xu {
56333be4e89SPeter Xu 	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
56433be4e89SPeter Xu 				     uffdio_copy->len,
56533be4e89SPeter Xu 				     offset);
56633be4e89SPeter Xu 	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
56733be4e89SPeter Xu 		/* real retval in ufdio_copy.copy */
56833be4e89SPeter Xu 		if (uffdio_copy->copy != -EEXIST)
56933be4e89SPeter Xu 			err("UFFDIO_COPY retry error: %"PRId64,
57033be4e89SPeter Xu 			    (int64_t)uffdio_copy->copy);
57133be4e89SPeter Xu 	} else {
57233be4e89SPeter Xu 		err("UFFDIO_COPY retry unexpected: %"PRId64,
57333be4e89SPeter Xu 		    (int64_t)uffdio_copy->copy);
57433be4e89SPeter Xu 	}
57533be4e89SPeter Xu }
57633be4e89SPeter Xu 
wake_range(int ufd,unsigned long addr,unsigned long len)57733be4e89SPeter Xu static void wake_range(int ufd, unsigned long addr, unsigned long len)
57833be4e89SPeter Xu {
57933be4e89SPeter Xu 	struct uffdio_range uffdio_wake;
58033be4e89SPeter Xu 
58133be4e89SPeter Xu 	uffdio_wake.start = addr;
58233be4e89SPeter Xu 	uffdio_wake.len = len;
58333be4e89SPeter Xu 
58433be4e89SPeter Xu 	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
58533be4e89SPeter Xu 		fprintf(stderr, "error waking %lu\n",
58633be4e89SPeter Xu 			addr), exit(1);
58733be4e89SPeter Xu }
58833be4e89SPeter Xu 
__copy_page(int ufd,unsigned long offset,bool retry,bool wp)5890210c43eSPeter Xu int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
59033be4e89SPeter Xu {
59133be4e89SPeter Xu 	struct uffdio_copy uffdio_copy;
59233be4e89SPeter Xu 
59333be4e89SPeter Xu 	if (offset >= nr_pages * page_size)
59433be4e89SPeter Xu 		err("unexpected offset %lu\n", offset);
59533be4e89SPeter Xu 	uffdio_copy.dst = (unsigned long) area_dst + offset;
59633be4e89SPeter Xu 	uffdio_copy.src = (unsigned long) area_src + offset;
59733be4e89SPeter Xu 	uffdio_copy.len = page_size;
5980210c43eSPeter Xu 	if (wp)
59933be4e89SPeter Xu 		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
60033be4e89SPeter Xu 	else
60133be4e89SPeter Xu 		uffdio_copy.mode = 0;
60233be4e89SPeter Xu 	uffdio_copy.copy = 0;
60333be4e89SPeter Xu 	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
60433be4e89SPeter Xu 		/* real retval in ufdio_copy.copy */
60533be4e89SPeter Xu 		if (uffdio_copy.copy != -EEXIST)
60633be4e89SPeter Xu 			err("UFFDIO_COPY error: %"PRId64,
60733be4e89SPeter Xu 			    (int64_t)uffdio_copy.copy);
60833be4e89SPeter Xu 		wake_range(ufd, uffdio_copy.dst, page_size);
60933be4e89SPeter Xu 	} else if (uffdio_copy.copy != page_size) {
61033be4e89SPeter Xu 		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
61133be4e89SPeter Xu 	} else {
61233be4e89SPeter Xu 		if (test_uffdio_copy_eexist && retry) {
61333be4e89SPeter Xu 			test_uffdio_copy_eexist = false;
61433be4e89SPeter Xu 			retry_copy_page(ufd, &uffdio_copy, offset);
61533be4e89SPeter Xu 		}
61633be4e89SPeter Xu 		return 1;
61733be4e89SPeter Xu 	}
61833be4e89SPeter Xu 	return 0;
61933be4e89SPeter Xu }
62033be4e89SPeter Xu 
/*
 * Convenience wrapper around __copy_page() that never requests the
 * -EEXIST retry check.  Parameters and return value are those of
 * __copy_page().
 */
int copy_page(int ufd, unsigned long offset, bool wp)
{
	const bool retry = false;

	return __copy_page(ufd, offset, retry, wp);
}
62556d2afffSJohn Hubbard 
uffd_open_dev(unsigned int flags)62656d2afffSJohn Hubbard int uffd_open_dev(unsigned int flags)
62756d2afffSJohn Hubbard {
62856d2afffSJohn Hubbard 	int fd, uffd;
62956d2afffSJohn Hubbard 
63056d2afffSJohn Hubbard 	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
63156d2afffSJohn Hubbard 	if (fd < 0)
63256d2afffSJohn Hubbard 		return fd;
63356d2afffSJohn Hubbard 	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
63456d2afffSJohn Hubbard 	close(fd);
63556d2afffSJohn Hubbard 
63656d2afffSJohn Hubbard 	return uffd;
63756d2afffSJohn Hubbard }
63856d2afffSJohn Hubbard 
/*
 * Create a userfaultfd through the userfaultfd system call.
 *
 * @flags: passed straight to the system call.
 *
 * Returns the result of the system call, or -1 when __NR_userfaultfd is
 * not defined at build time.
 */
int uffd_open_sys(unsigned int flags)
{
	int fd = -1;

#ifdef __NR_userfaultfd
	fd = syscall(__NR_userfaultfd, flags);
#endif

	return fd;
}
64756d2afffSJohn Hubbard 
/*
 * Create a userfaultfd, preferring the system call and falling back to
 * the device node when the system call path returns a negative value.
 *
 * @flags: forwarded unchanged to both helpers.
 *
 * Returns whatever the helper that was used last returned.
 */
int uffd_open(unsigned int flags)
{
	int fd = uffd_open_sys(flags);

	return fd >= 0 ? fd : uffd_open_dev(flags);
}
65756d2afffSJohn Hubbard 
uffd_get_features(uint64_t * features)65856d2afffSJohn Hubbard int uffd_get_features(uint64_t *features)
65956d2afffSJohn Hubbard {
66056d2afffSJohn Hubbard 	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
66156d2afffSJohn Hubbard 	/*
66256d2afffSJohn Hubbard 	 * This should by default work in most kernels; the feature list
66356d2afffSJohn Hubbard 	 * will be the same no matter what we pass in here.
66456d2afffSJohn Hubbard 	 */
66556d2afffSJohn Hubbard 	int fd = uffd_open(UFFD_USER_MODE_ONLY);
66656d2afffSJohn Hubbard 
66756d2afffSJohn Hubbard 	if (fd < 0)
66856d2afffSJohn Hubbard 		/* Maybe the kernel is older than user-only mode? */
66956d2afffSJohn Hubbard 		fd = uffd_open(0);
67056d2afffSJohn Hubbard 
67156d2afffSJohn Hubbard 	if (fd < 0)
67256d2afffSJohn Hubbard 		return fd;
67356d2afffSJohn Hubbard 
67456d2afffSJohn Hubbard 	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
67556d2afffSJohn Hubbard 		close(fd);
67656d2afffSJohn Hubbard 		return -errno;
67756d2afffSJohn Hubbard 	}
67856d2afffSJohn Hubbard 
67956d2afffSJohn Hubbard 	*features = uffdio_api.features;
68056d2afffSJohn Hubbard 	close(fd);
68156d2afffSJohn Hubbard 
68256d2afffSJohn Hubbard 	return 0;
68356d2afffSJohn Hubbard }
684