xref: /openbmc/linux/tools/testing/selftests/mm/cow.c (revision ecefa105)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * COW (Copy On Write) tests.
4  *
5  * Copyright 2022, Red Hat, Inc.
6  *
7  * Author(s): David Hildenbrand <david@redhat.com>
8  */
9 #define _GNU_SOURCE
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <dirent.h>
18 #include <assert.h>
19 #include <sys/mman.h>
20 #include <sys/ioctl.h>
21 #include <sys/wait.h>
22 #include <linux/memfd.h>
23 
24 #include "local_config.h"
25 #ifdef LOCAL_CONFIG_HAVE_LIBURING
26 #include <liburing.h>
27 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
28 
29 #include "../../../../mm/gup_test.h"
30 #include "../kselftest.h"
31 #include "vm_util.h"
32 
/* Fallback definition used when the included headers lack MADV_PAGEOUT. */
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21
#endif
/* Fallback definition used when the included headers lack MADV_COLLAPSE. */
#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25
#endif
39 
/* Base page size in bytes; initialized outside the section visible here. */
static size_t pagesize;
/* File descriptor handed to the pagemap_is_*() helpers. */
static int pagemap_fd;
/* THP size in bytes as found by detect_thpsize(); 0 if none was detected. */
static size_t thpsize;
/* Number of valid entries in hugetlbsizes[]. */
static int nr_hugetlbsizes;
/* hugetlb page sizes in bytes, filled in by detect_hugetlbsizes(). */
static size_t hugetlbsizes[10];
/* fd used for the PIN_LONGTERM_TEST_* ioctls; negative if unavailable. */
static int gup_fd;
/* Set by detect_huge_zeropage() when use_zero_page reads as 1. */
static bool has_huge_zeropage;
47 
48 static void detect_thpsize(void)
49 {
50 	int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
51 		      O_RDONLY);
52 	size_t size = 0;
53 	char buf[15];
54 	int ret;
55 
56 	if (fd < 0)
57 		return;
58 
59 	ret = pread(fd, buf, sizeof(buf), 0);
60 	if (ret > 0 && ret < sizeof(buf)) {
61 		buf[ret] = 0;
62 
63 		size = strtoul(buf, NULL, 10);
64 		if (size < pagesize)
65 			size = 0;
66 		if (size > 0) {
67 			thpsize = size;
68 			ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
69 				       thpsize / 1024);
70 		}
71 	}
72 
73 	close(fd);
74 }
75 
76 static void detect_huge_zeropage(void)
77 {
78 	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
79 		      O_RDONLY);
80 	size_t enabled = 0;
81 	char buf[15];
82 	int ret;
83 
84 	if (fd < 0)
85 		return;
86 
87 	ret = pread(fd, buf, sizeof(buf), 0);
88 	if (ret > 0 && ret < sizeof(buf)) {
89 		buf[ret] = 0;
90 
91 		enabled = strtoul(buf, NULL, 10);
92 		if (enabled == 1) {
93 			has_huge_zeropage = true;
94 			ksft_print_msg("[INFO] huge zeropage is enabled\n");
95 		}
96 	}
97 
98 	close(fd);
99 }
100 
101 static void detect_hugetlbsizes(void)
102 {
103 	DIR *dir = opendir("/sys/kernel/mm/hugepages/");
104 
105 	if (!dir)
106 		return;
107 
108 	while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) {
109 		struct dirent *entry = readdir(dir);
110 		size_t kb;
111 
112 		if (!entry)
113 			break;
114 		if (entry->d_type != DT_DIR)
115 			continue;
116 		if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1)
117 			continue;
118 		hugetlbsizes[nr_hugetlbsizes] = kb * 1024;
119 		nr_hugetlbsizes++;
120 		ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n",
121 			       kb);
122 	}
123 	closedir(dir);
124 }
125 
126 static bool range_is_swapped(void *addr, size_t size)
127 {
128 	for (; size; addr += pagesize, size -= pagesize)
129 		if (!pagemap_is_swapped(pagemap_fd, addr))
130 			return false;
131 	return true;
132 }
133 
/*
 * Two pipes used to synchronize a parent with a forked child. For each
 * array, index 0 is the read end and index 1 the write end (see pipe(2)).
 */
struct comm_pipes {
	int child_ready[2];	/* written by the child, read by the parent */
	int parent_ready[2];	/* written by the parent, read by the child */
};

/*
 * Create both pipes of @comm_pipes.
 *
 * Returns 0 on success or a negative errno value on failure. On failure no
 * file descriptor created here is left open.
 */
static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	int err;

	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		/* Save errno first: the close() calls below may overwrite it. */
		err = errno;
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -err;
	}

	return 0;
}

/* Close all four file descriptors created by setup_comm_pipes(). */
static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}
159 
160 static int child_memcmp_fn(char *mem, size_t size,
161 			   struct comm_pipes *comm_pipes)
162 {
163 	char *old = malloc(size);
164 	char buf;
165 
166 	/* Backup the original content. */
167 	memcpy(old, mem, size);
168 
169 	/* Wait until the parent modified the page. */
170 	write(comm_pipes->child_ready[1], "0", 1);
171 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
172 		;
173 
174 	/* See if we still read the old values. */
175 	return memcmp(old, mem, size);
176 }
177 
178 static int child_vmsplice_memcmp_fn(char *mem, size_t size,
179 				    struct comm_pipes *comm_pipes)
180 {
181 	struct iovec iov = {
182 		.iov_base = mem,
183 		.iov_len = size,
184 	};
185 	ssize_t cur, total, transferred;
186 	char *old, *new;
187 	int fds[2];
188 	char buf;
189 
190 	old = malloc(size);
191 	new = malloc(size);
192 
193 	/* Backup the original content. */
194 	memcpy(old, mem, size);
195 
196 	if (pipe(fds) < 0)
197 		return -errno;
198 
199 	/* Trigger a read-only pin. */
200 	transferred = vmsplice(fds[1], &iov, 1, 0);
201 	if (transferred < 0)
202 		return -errno;
203 	if (transferred == 0)
204 		return -EINVAL;
205 
206 	/* Unmap it from our page tables. */
207 	if (munmap(mem, size) < 0)
208 		return -errno;
209 
210 	/* Wait until the parent modified it. */
211 	write(comm_pipes->child_ready[1], "0", 1);
212 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
213 		;
214 
215 	/* See if we still read the old values via the pipe. */
216 	for (total = 0; total < transferred; total += cur) {
217 		cur = read(fds[0], new + total, transferred - total);
218 		if (cur < 0)
219 			return -errno;
220 	}
221 
222 	return memcmp(old, new, transferred);
223 }
224 
/*
 * Function run in the forked child by do_test_cow_in_parent(); its return
 * value is passed to exit(), and 0 is treated as success by the parent.
 */
typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
226 
227 static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
228 				  child_fn fn)
229 {
230 	struct comm_pipes comm_pipes;
231 	char buf;
232 	int ret;
233 
234 	ret = setup_comm_pipes(&comm_pipes);
235 	if (ret) {
236 		ksft_test_result_fail("pipe() failed\n");
237 		return;
238 	}
239 
240 	ret = fork();
241 	if (ret < 0) {
242 		ksft_test_result_fail("fork() failed\n");
243 		goto close_comm_pipes;
244 	} else if (!ret) {
245 		exit(fn(mem, size, &comm_pipes));
246 	}
247 
248 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
249 		;
250 
251 	if (do_mprotect) {
252 		/*
253 		 * mprotect() optimizations might try avoiding
254 		 * write-faults by directly mapping pages writable.
255 		 */
256 		ret = mprotect(mem, size, PROT_READ);
257 		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
258 		if (ret) {
259 			ksft_test_result_fail("mprotect() failed\n");
260 			write(comm_pipes.parent_ready[1], "0", 1);
261 			wait(&ret);
262 			goto close_comm_pipes;
263 		}
264 	}
265 
266 	/* Modify the page. */
267 	memset(mem, 0xff, size);
268 	write(comm_pipes.parent_ready[1], "0", 1);
269 
270 	wait(&ret);
271 	if (WIFEXITED(ret))
272 		ret = WEXITSTATUS(ret);
273 	else
274 		ret = -EINVAL;
275 
276 	ksft_test_result(!ret, "No leak from parent into child\n");
277 close_comm_pipes:
278 	close_comm_pipes(&comm_pipes);
279 }
280 
/* Parent writes after fork(); the child compares via memcmp(). No mprotect(). */
static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}
285 
/* Like test_cow_in_parent(), with the mprotect() cycle before the write. */
static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}
290 
/* Parent writes after fork(); the child checks via vmsplice(). No mprotect(). */
static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}
295 
/* Like test_vmsplice_in_child(), with the mprotect() cycle before the write. */
static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}
300 
301 static void do_test_vmsplice_in_parent(char *mem, size_t size,
302 				       bool before_fork)
303 {
304 	struct iovec iov = {
305 		.iov_base = mem,
306 		.iov_len = size,
307 	};
308 	ssize_t cur, total, transferred;
309 	struct comm_pipes comm_pipes;
310 	char *old, *new;
311 	int ret, fds[2];
312 	char buf;
313 
314 	old = malloc(size);
315 	new = malloc(size);
316 
317 	memcpy(old, mem, size);
318 
319 	ret = setup_comm_pipes(&comm_pipes);
320 	if (ret) {
321 		ksft_test_result_fail("pipe() failed\n");
322 		goto free;
323 	}
324 
325 	if (pipe(fds) < 0) {
326 		ksft_test_result_fail("pipe() failed\n");
327 		goto close_comm_pipes;
328 	}
329 
330 	if (before_fork) {
331 		transferred = vmsplice(fds[1], &iov, 1, 0);
332 		if (transferred <= 0) {
333 			ksft_test_result_fail("vmsplice() failed\n");
334 			goto close_pipe;
335 		}
336 	}
337 
338 	ret = fork();
339 	if (ret < 0) {
340 		ksft_test_result_fail("fork() failed\n");
341 		goto close_pipe;
342 	} else if (!ret) {
343 		write(comm_pipes.child_ready[1], "0", 1);
344 		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
345 			;
346 		/* Modify page content in the child. */
347 		memset(mem, 0xff, size);
348 		exit(0);
349 	}
350 
351 	if (!before_fork) {
352 		transferred = vmsplice(fds[1], &iov, 1, 0);
353 		if (transferred <= 0) {
354 			ksft_test_result_fail("vmsplice() failed\n");
355 			wait(&ret);
356 			goto close_pipe;
357 		}
358 	}
359 
360 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
361 		;
362 	if (munmap(mem, size) < 0) {
363 		ksft_test_result_fail("munmap() failed\n");
364 		goto close_pipe;
365 	}
366 	write(comm_pipes.parent_ready[1], "0", 1);
367 
368 	/* Wait until the child is done writing. */
369 	wait(&ret);
370 	if (!WIFEXITED(ret)) {
371 		ksft_test_result_fail("wait() failed\n");
372 		goto close_pipe;
373 	}
374 
375 	/* See if we still read the old values. */
376 	for (total = 0; total < transferred; total += cur) {
377 		cur = read(fds[0], new + total, transferred - total);
378 		if (cur < 0) {
379 			ksft_test_result_fail("read() failed\n");
380 			goto close_pipe;
381 		}
382 	}
383 
384 	ksft_test_result(!memcmp(old, new, transferred),
385 			 "No leak from child into parent\n");
386 close_pipe:
387 	close(fds[0]);
388 	close(fds[1]);
389 close_comm_pipes:
390 	close_comm_pipes(&comm_pipes);
391 free:
392 	free(old);
393 	free(new);
394 }
395 
/* vmsplice() in the parent before fork(); the child modifies the range. */
static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}
400 
/* vmsplice() in the parent after fork(); the child modifies the range. */
static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}
405 
406 #ifdef LOCAL_CONFIG_HAVE_LIBURING
407 static void do_test_iouring(char *mem, size_t size, bool use_fork)
408 {
409 	struct comm_pipes comm_pipes;
410 	struct io_uring_cqe *cqe;
411 	struct io_uring_sqe *sqe;
412 	struct io_uring ring;
413 	ssize_t cur, total;
414 	struct iovec iov;
415 	char *buf, *tmp;
416 	int ret, fd;
417 	FILE *file;
418 
419 	ret = setup_comm_pipes(&comm_pipes);
420 	if (ret) {
421 		ksft_test_result_fail("pipe() failed\n");
422 		return;
423 	}
424 
425 	file = tmpfile();
426 	if (!file) {
427 		ksft_test_result_fail("tmpfile() failed\n");
428 		goto close_comm_pipes;
429 	}
430 	fd = fileno(file);
431 	assert(fd);
432 
433 	tmp = malloc(size);
434 	if (!tmp) {
435 		ksft_test_result_fail("malloc() failed\n");
436 		goto close_file;
437 	}
438 
439 	/* Skip on errors, as we might just lack kernel support. */
440 	ret = io_uring_queue_init(1, &ring, 0);
441 	if (ret < 0) {
442 		ksft_test_result_skip("io_uring_queue_init() failed\n");
443 		goto free_tmp;
444 	}
445 
446 	/*
447 	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
448 	 * | FOLL_LONGTERM the range.
449 	 *
450 	 * Skip on errors, as we might just lack kernel support or might not
451 	 * have sufficient MEMLOCK permissions.
452 	 */
453 	iov.iov_base = mem;
454 	iov.iov_len = size;
455 	ret = io_uring_register_buffers(&ring, &iov, 1);
456 	if (ret) {
457 		ksft_test_result_skip("io_uring_register_buffers() failed\n");
458 		goto queue_exit;
459 	}
460 
461 	if (use_fork) {
462 		/*
463 		 * fork() and keep the child alive until we're done. Note that
464 		 * we expect the pinned page to not get shared with the child.
465 		 */
466 		ret = fork();
467 		if (ret < 0) {
468 			ksft_test_result_fail("fork() failed\n");
469 			goto unregister_buffers;
470 		} else if (!ret) {
471 			write(comm_pipes.child_ready[1], "0", 1);
472 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
473 				;
474 			exit(0);
475 		}
476 
477 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
478 			;
479 	} else {
480 		/*
481 		 * Map the page R/O into the page table. Enable softdirty
482 		 * tracking to stop the page from getting mapped R/W immediately
483 		 * again by mprotect() optimizations. Note that we don't have an
484 		 * easy way to test if that worked (the pagemap does not export
485 		 * if the page is mapped R/O vs. R/W).
486 		 */
487 		ret = mprotect(mem, size, PROT_READ);
488 		clear_softdirty();
489 		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
490 		if (ret) {
491 			ksft_test_result_fail("mprotect() failed\n");
492 			goto unregister_buffers;
493 		}
494 	}
495 
496 	/*
497 	 * Modify the page and write page content as observed by the fixed
498 	 * buffer pin to the file so we can verify it.
499 	 */
500 	memset(mem, 0xff, size);
501 	sqe = io_uring_get_sqe(&ring);
502 	if (!sqe) {
503 		ksft_test_result_fail("io_uring_get_sqe() failed\n");
504 		goto quit_child;
505 	}
506 	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
507 
508 	ret = io_uring_submit(&ring);
509 	if (ret < 0) {
510 		ksft_test_result_fail("io_uring_submit() failed\n");
511 		goto quit_child;
512 	}
513 
514 	ret = io_uring_wait_cqe(&ring, &cqe);
515 	if (ret < 0) {
516 		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
517 		goto quit_child;
518 	}
519 
520 	if (cqe->res != size) {
521 		ksft_test_result_fail("write_fixed failed\n");
522 		goto quit_child;
523 	}
524 	io_uring_cqe_seen(&ring, cqe);
525 
526 	/* Read back the file content to the temporary buffer. */
527 	total = 0;
528 	while (total < size) {
529 		cur = pread(fd, tmp + total, size - total, total);
530 		if (cur < 0) {
531 			ksft_test_result_fail("pread() failed\n");
532 			goto quit_child;
533 		}
534 		total += cur;
535 	}
536 
537 	/* Finally, check if we read what we expected. */
538 	ksft_test_result(!memcmp(mem, tmp, size),
539 			 "Longterm R/W pin is reliable\n");
540 
541 quit_child:
542 	if (use_fork) {
543 		write(comm_pipes.parent_ready[1], "0", 1);
544 		wait(&ret);
545 	}
546 unregister_buffers:
547 	io_uring_unregister_buffers(&ring);
548 queue_exit:
549 	io_uring_queue_exit(&ring);
550 free_tmp:
551 	free(tmp);
552 close_file:
553 	fclose(file);
554 close_comm_pipes:
555 	close_comm_pipes(&comm_pipes);
556 }
557 
/* io_uring fixed-buffer test using the mprotect() variant (no fork()). */
static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}
562 
/* io_uring fixed-buffer test using the fork() variant. */
static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}
567 
568 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
569 
/* Preparation performed by do_test_ro_pin() before taking the R/O pin. */
enum ro_pin_test {
	/* No preparation. */
	RO_PIN_TEST,
	/* fork() a child that stays alive until after the pin test. */
	RO_PIN_TEST_SHARED,
	/* fork() a child and let it exit before the pin is taken. */
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	/* mprotect(PROT_READ) + clear softdirty + mprotect(R/W); no fork(). */
	RO_PIN_TEST_RO_EXCLUSIVE,
};
576 
577 static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
578 			   bool fast)
579 {
580 	struct pin_longterm_test args;
581 	struct comm_pipes comm_pipes;
582 	char *tmp, buf;
583 	__u64 tmp_val;
584 	int ret;
585 
586 	if (gup_fd < 0) {
587 		ksft_test_result_skip("gup_test not available\n");
588 		return;
589 	}
590 
591 	tmp = malloc(size);
592 	if (!tmp) {
593 		ksft_test_result_fail("malloc() failed\n");
594 		return;
595 	}
596 
597 	ret = setup_comm_pipes(&comm_pipes);
598 	if (ret) {
599 		ksft_test_result_fail("pipe() failed\n");
600 		goto free_tmp;
601 	}
602 
603 	switch (test) {
604 	case RO_PIN_TEST:
605 		break;
606 	case RO_PIN_TEST_SHARED:
607 	case RO_PIN_TEST_PREVIOUSLY_SHARED:
608 		/*
609 		 * Share the pages with our child. As the pages are not pinned,
610 		 * this should just work.
611 		 */
612 		ret = fork();
613 		if (ret < 0) {
614 			ksft_test_result_fail("fork() failed\n");
615 			goto close_comm_pipes;
616 		} else if (!ret) {
617 			write(comm_pipes.child_ready[1], "0", 1);
618 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
619 				;
620 			exit(0);
621 		}
622 
623 		/* Wait until our child is ready. */
624 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
625 			;
626 
627 		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
628 			/*
629 			 * Tell the child to quit now and wait until it quit.
630 			 * The pages should now be mapped R/O into our page
631 			 * tables, but they are no longer shared.
632 			 */
633 			write(comm_pipes.parent_ready[1], "0", 1);
634 			wait(&ret);
635 			if (!WIFEXITED(ret))
636 				ksft_print_msg("[INFO] wait() failed\n");
637 		}
638 		break;
639 	case RO_PIN_TEST_RO_EXCLUSIVE:
640 		/*
641 		 * Map the page R/O into the page table. Enable softdirty
642 		 * tracking to stop the page from getting mapped R/W immediately
643 		 * again by mprotect() optimizations. Note that we don't have an
644 		 * easy way to test if that worked (the pagemap does not export
645 		 * if the page is mapped R/O vs. R/W).
646 		 */
647 		ret = mprotect(mem, size, PROT_READ);
648 		clear_softdirty();
649 		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
650 		if (ret) {
651 			ksft_test_result_fail("mprotect() failed\n");
652 			goto close_comm_pipes;
653 		}
654 		break;
655 	default:
656 		assert(false);
657 	}
658 
659 	/* Take a R/O pin. This should trigger unsharing. */
660 	args.addr = (__u64)(uintptr_t)mem;
661 	args.size = size;
662 	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
663 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
664 	if (ret) {
665 		if (errno == EINVAL)
666 			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
667 		else
668 			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
669 		goto wait;
670 	}
671 
672 	/* Modify the page. */
673 	memset(mem, 0xff, size);
674 
675 	/*
676 	 * Read back the content via the pin to the temporary buffer and
677 	 * test if we observed the modification.
678 	 */
679 	tmp_val = (__u64)(uintptr_t)tmp;
680 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
681 	if (ret)
682 		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
683 	else
684 		ksft_test_result(!memcmp(mem, tmp, size),
685 				 "Longterm R/O pin is reliable\n");
686 
687 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
688 	if (ret)
689 		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
690 wait:
691 	switch (test) {
692 	case RO_PIN_TEST_SHARED:
693 		write(comm_pipes.parent_ready[1], "0", 1);
694 		wait(&ret);
695 		if (!WIFEXITED(ret))
696 			ksft_print_msg("[INFO] wait() failed\n");
697 		break;
698 	default:
699 		break;
700 	}
701 close_comm_pipes:
702 	close_comm_pipes(&comm_pipes);
703 free_tmp:
704 	free(tmp);
705 }
706 
/* R/O pin while the range is shared with a live child; regular GUP. */
static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}
711 
/* R/O pin while the range is shared with a live child; GUP-fast. */
static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}
716 
/* R/O pin after a sharing child has already exited; regular GUP. */
static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}
721 
/* R/O pin after a sharing child has already exited; GUP-fast. */
static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}
726 
/* R/O pin after the mprotect()/softdirty preparation; regular GUP. */
static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}
731 
/* R/O pin after the mprotect()/softdirty preparation; GUP-fast. */
static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}
736 
/* A test body: receives the prepared memory range and its size in bytes. */
typedef void (*test_fn)(char *mem, size_t size);
738 
739 static void do_run_with_base_page(test_fn fn, bool swapout)
740 {
741 	char *mem;
742 	int ret;
743 
744 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
745 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
746 	if (mem == MAP_FAILED) {
747 		ksft_test_result_fail("mmap() failed\n");
748 		return;
749 	}
750 
751 	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
752 	/* Ignore if not around on a kernel. */
753 	if (ret && errno != EINVAL) {
754 		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
755 		goto munmap;
756 	}
757 
758 	/* Populate a base page. */
759 	memset(mem, 0, pagesize);
760 
761 	if (swapout) {
762 		madvise(mem, pagesize, MADV_PAGEOUT);
763 		if (!pagemap_is_swapped(pagemap_fd, mem)) {
764 			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
765 			goto munmap;
766 		}
767 	}
768 
769 	fn(mem, pagesize);
770 munmap:
771 	munmap(mem, pagesize);
772 }
773 
/* Log the run and execute @fn on a populated base page. */
static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}
779 
/* Log the run and execute @fn on a base page that was paged out first. */
static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}
785 
/* How do_run_with_thp() prepares the THP before running the test. */
enum thp_run {
	/* Use the populated THP as is. */
	THP_RUN_PMD,
	/* Like THP_RUN_PMD, then MADV_PAGEOUT the range. */
	THP_RUN_PMD_SWAPOUT,
	/* Temporarily mprotect() one subpage R/O to trigger PTE-mapping. */
	THP_RUN_PTE,
	/* Like THP_RUN_PTE, then MADV_PAGEOUT the range. */
	THP_RUN_PTE_SWAPOUT,
	/* MADV_DONTNEED all but the first subpage; test that subpage only. */
	THP_RUN_SINGLE_PTE,
	/* Like THP_RUN_SINGLE_PTE, then MADV_PAGEOUT the remaining subpage. */
	THP_RUN_SINGLE_PTE_SWAPOUT,
	/* mremap() the upper half away; test the lower half. */
	THP_RUN_PARTIAL_MREMAP,
	/* Share only the first subpage with a short-lived child. */
	THP_RUN_PARTIAL_SHARED,
};
796 
797 static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
798 {
799 	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
800 	size_t size, mmap_size, mremap_size;
801 	int ret;
802 
803 	/* For alignment purposes, we need twice the thp size. */
804 	mmap_size = 2 * thpsize;
805 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
806 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
807 	if (mmap_mem == MAP_FAILED) {
808 		ksft_test_result_fail("mmap() failed\n");
809 		return;
810 	}
811 
812 	/* We need a THP-aligned memory area. */
813 	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
814 
815 	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
816 	if (ret) {
817 		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
818 		goto munmap;
819 	}
820 
821 	/*
822 	 * Try to populate a THP. Touch the first sub-page and test if we get
823 	 * another sub-page populated automatically.
824 	 */
825 	mem[0] = 0;
826 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
827 		ksft_test_result_skip("Did not get a THP populated\n");
828 		goto munmap;
829 	}
830 	memset(mem, 0, thpsize);
831 
832 	size = thpsize;
833 	switch (thp_run) {
834 	case THP_RUN_PMD:
835 	case THP_RUN_PMD_SWAPOUT:
836 		break;
837 	case THP_RUN_PTE:
838 	case THP_RUN_PTE_SWAPOUT:
839 		/*
840 		 * Trigger PTE-mapping the THP by temporarily mapping a single
841 		 * subpage R/O.
842 		 */
843 		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
844 		if (ret) {
845 			ksft_test_result_fail("mprotect() failed\n");
846 			goto munmap;
847 		}
848 		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
849 		if (ret) {
850 			ksft_test_result_fail("mprotect() failed\n");
851 			goto munmap;
852 		}
853 		break;
854 	case THP_RUN_SINGLE_PTE:
855 	case THP_RUN_SINGLE_PTE_SWAPOUT:
856 		/*
857 		 * Discard all but a single subpage of that PTE-mapped THP. What
858 		 * remains is a single PTE mapping a single subpage.
859 		 */
860 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
861 		if (ret) {
862 			ksft_test_result_fail("MADV_DONTNEED failed\n");
863 			goto munmap;
864 		}
865 		size = pagesize;
866 		break;
867 	case THP_RUN_PARTIAL_MREMAP:
868 		/*
869 		 * Remap half of the THP. We need some new memory location
870 		 * for that.
871 		 */
872 		mremap_size = thpsize / 2;
873 		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
874 				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
875 		if (mem == MAP_FAILED) {
876 			ksft_test_result_fail("mmap() failed\n");
877 			goto munmap;
878 		}
879 		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
880 			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
881 		if (tmp != mremap_mem) {
882 			ksft_test_result_fail("mremap() failed\n");
883 			goto munmap;
884 		}
885 		size = mremap_size;
886 		break;
887 	case THP_RUN_PARTIAL_SHARED:
888 		/*
889 		 * Share the first page of the THP with a child and quit the
890 		 * child. This will result in some parts of the THP never
891 		 * have been shared.
892 		 */
893 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
894 		if (ret) {
895 			ksft_test_result_fail("MADV_DONTFORK failed\n");
896 			goto munmap;
897 		}
898 		ret = fork();
899 		if (ret < 0) {
900 			ksft_test_result_fail("fork() failed\n");
901 			goto munmap;
902 		} else if (!ret) {
903 			exit(0);
904 		}
905 		wait(&ret);
906 		/* Allow for sharing all pages again. */
907 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
908 		if (ret) {
909 			ksft_test_result_fail("MADV_DOFORK failed\n");
910 			goto munmap;
911 		}
912 		break;
913 	default:
914 		assert(false);
915 	}
916 
917 	switch (thp_run) {
918 	case THP_RUN_PMD_SWAPOUT:
919 	case THP_RUN_PTE_SWAPOUT:
920 	case THP_RUN_SINGLE_PTE_SWAPOUT:
921 		madvise(mem, size, MADV_PAGEOUT);
922 		if (!range_is_swapped(mem, size)) {
923 			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
924 			goto munmap;
925 		}
926 		break;
927 	default:
928 		break;
929 	}
930 
931 	fn(mem, size);
932 munmap:
933 	munmap(mmap_mem, mmap_size);
934 	if (mremap_mem != MAP_FAILED)
935 		munmap(mremap_mem, mremap_size);
936 }
937 
/* Log the run and execute @fn on a populated THP (THP_RUN_PMD). */
static void run_with_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD);
}
943 
/* Log the run and execute @fn on a THP that was paged out first. */
static void run_with_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
}
949 
/* Log the run and execute @fn using the THP_RUN_PTE preparation. */
static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE);
}
955 
/* Log the run and execute @fn using the THP_RUN_PTE_SWAPOUT preparation. */
static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
}
961 
/* Log the run and execute @fn on a single remaining subpage of a THP. */
static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
}
967 
/* Like run_with_single_pte_of_thp(), with the subpage paged out first. */
static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
}
973 
/* Log the run and execute @fn on a THP whose upper half was mremap()'ed. */
static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
}
979 
/* Log the run and execute @fn using the THP_RUN_PARTIAL_SHARED preparation. */
static void run_with_partial_shared_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
}
985 
986 static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
987 {
988 	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
989 	char *mem, *dummy;
990 
991 	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
992 		       hugetlbsize / 1024);
993 
994 	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
995 
996 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
997 	if (mem == MAP_FAILED) {
998 		ksft_test_result_skip("need more free huge pages\n");
999 		return;
1000 	}
1001 
1002 	/* Populate an huge page. */
1003 	memset(mem, 0, hugetlbsize);
1004 
1005 	/*
1006 	 * We need a total of two hugetlb pages to handle COW/unsharing
1007 	 * properly, otherwise we might get zapped by a SIGBUS.
1008 	 */
1009 	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
1010 	if (dummy == MAP_FAILED) {
1011 		ksft_test_result_skip("need more free huge pages\n");
1012 		goto munmap;
1013 	}
1014 	munmap(dummy, hugetlbsize);
1015 
1016 	fn(mem, hugetlbsize);
1017 munmap:
1018 	munmap(mem, hugetlbsize);
1019 }
1020 
/* A named test: @desc is printed in the "[RUN]" log line, @fn is the body. */
struct test_case {
	const char *desc;
	test_fn fn;
};
1025 
1026 /*
1027  * Test cases that are specific to anonymous pages: pages in private mappings
1028  * that may get shared via COW during fork().
1029  */
1030 static const struct test_case anon_test_cases[] = {
1031 	/*
1032 	 * Basic COW tests for fork() without any GUP. If we miss to break COW,
1033 	 * either the child can observe modifications by the parent or the
1034 	 * other way around.
1035 	 */
1036 	{
1037 		"Basic COW after fork()",
1038 		test_cow_in_parent,
1039 	},
1040 	/*
1041 	 * Basic test, but do an additional mprotect(PROT_READ)+
1042 	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1043 	 */
1044 	{
1045 		"Basic COW after fork() with mprotect() optimization",
1046 		test_cow_in_parent_mprotect,
1047 	},
1048 	/*
1049 	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
1050 	 * we miss to break COW, the child observes modifications by the parent.
1051 	 * This is CVE-2020-29374 reported by Jann Horn.
1052 	 */
1053 	{
1054 		"vmsplice() + unmap in child",
1055 		test_vmsplice_in_child
1056 	},
1057 	/*
1058 	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
1059 	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1060 	 */
1061 	{
1062 		"vmsplice() + unmap in child with mprotect() optimization",
1063 		test_vmsplice_in_child_mprotect
1064 	},
1065 	/*
1066 	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
1067 	 * fork(); modify in the child. If we miss to break COW, the parent
1068 	 * observes modifications by the child.
1069 	 */
1070 	{
1071 		"vmsplice() before fork(), unmap in parent after fork()",
1072 		test_vmsplice_before_fork,
1073 	},
1074 	/*
1075 	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
1076 	 * child. If we miss to break COW, the parent observes modifications by
1077 	 * the child.
1078 	 */
1079 	{
1080 		"vmsplice() + unmap in parent after fork()",
1081 		test_vmsplice_after_fork,
1082 	},
1083 #ifdef LOCAL_CONFIG_HAVE_LIBURING
1084 	/*
1085 	 * Take a R/W longterm pin and then map the page R/O into the page
1086 	 * table to trigger a write fault on next access. When modifying the
1087 	 * page, the page content must be visible via the pin.
1088 	 */
1089 	{
1090 		"R/O-mapping a page registered as iouring fixed buffer",
1091 		test_iouring_ro,
1092 	},
1093 	/*
1094 	 * Take a R/W longterm pin and then fork() a child. When modifying the
1095 	 * page, the page content must be visible via the pin. We expect the
1096 	 * pinned page to not get shared with the child.
1097 	 */
1098 	{
1099 		"fork() with an iouring fixed buffer",
1100 		test_iouring_fork,
1101 	},
1102 
1103 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
1104 	/*
1105 	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
1106 	 * When modifying the page via the page table, the page content change
1107 	 * must be visible via the pin.
1108 	 */
1109 	{
1110 		"R/O GUP pin on R/O-mapped shared page",
1111 		test_ro_pin_on_shared,
1112 	},
1113 	/* Same as above, but using GUP-fast. */
1114 	{
1115 		"R/O GUP-fast pin on R/O-mapped shared page",
1116 		test_ro_fast_pin_on_shared,
1117 	},
1118 	/*
1119 	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
1120 	 * was previously shared. When modifying the page via the page table,
1121 	 * the page content change must be visible via the pin.
1122 	 */
1123 	{
1124 		"R/O GUP pin on R/O-mapped previously-shared page",
1125 		test_ro_pin_on_ro_previously_shared,
1126 	},
1127 	/* Same as above, but using GUP-fast. */
1128 	{
1129 		"R/O GUP-fast pin on R/O-mapped previously-shared page",
1130 		test_ro_fast_pin_on_ro_previously_shared,
1131 	},
1132 	/*
1133 	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
1134 	 * When modifying the page via the page table, the page content change
1135 	 * must be visible via the pin.
1136 	 */
1137 	{
1138 		"R/O GUP pin on R/O-mapped exclusive page",
1139 		test_ro_pin_on_ro_exclusive,
1140 	},
1141 	/* Same as above, but using GUP-fast. */
1142 	{
1143 		"R/O GUP-fast pin on R/O-mapped exclusive page",
1144 		test_ro_fast_pin_on_ro_exclusive,
1145 	},
1146 };
1147 
1148 static void run_anon_test_case(struct test_case const *test_case)
1149 {
1150 	int i;
1151 
1152 	run_with_base_page(test_case->fn, test_case->desc);
1153 	run_with_base_page_swap(test_case->fn, test_case->desc);
1154 	if (thpsize) {
1155 		run_with_thp(test_case->fn, test_case->desc);
1156 		run_with_thp_swap(test_case->fn, test_case->desc);
1157 		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
1158 		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
1159 		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
1160 		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
1161 		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
1162 		run_with_partial_shared_thp(test_case->fn, test_case->desc);
1163 	}
1164 	for (i = 0; i < nr_hugetlbsizes; i++)
1165 		run_with_hugetlb(test_case->fn, test_case->desc,
1166 				 hugetlbsizes[i]);
1167 }
1168 
1169 static void run_anon_test_cases(void)
1170 {
1171 	int i;
1172 
1173 	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
1174 
1175 	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
1176 		run_anon_test_case(&anon_test_cases[i]);
1177 }
1178 
1179 static int tests_per_anon_test_case(void)
1180 {
1181 	int tests = 2 + nr_hugetlbsizes;
1182 
1183 	if (thpsize)
1184 		tests += 8;
1185 	return tests;
1186 }
1187 
/* Variants of the anonymous THP collapse test, see do_test_anon_thp_collapse(). */
enum anon_thp_collapse_test {
	/* Collapse before fork(), i.e., before the page gets COW-shared. */
	ANON_THP_COLLAPSE_UNSHARED,
	/* COW-share the whole PTE-mapped THP, then collapse after fork(). */
	ANON_THP_COLLAPSE_FULLY_SHARED,
	/* Upper half is MADV_DONTFORK: only the lower half gets COW-shared. */
	ANON_THP_COLLAPSE_LOWER_SHARED,
	/* Lower half is MADV_DONTFORK: only the upper half gets COW-shared. */
	ANON_THP_COLLAPSE_UPPER_SHARED,
};
1194 
/*
 * Common implementation of the anonymous THP collapse tests.
 *
 * @mem:  start of the memory area under test
 * @size: size of the memory area in bytes
 * @test: which sharing/collapse variant to exercise
 *
 * The THP is first PTE-mapped via an mprotect() round trip on one subpage.
 * Depending on @test, the area is collapsed with MADV_COLLAPSE either before
 * fork() or after fork() (optionally with one half excluded from the child via
 * MADV_DONTFORK). The parent then overwrites the area with 0xff and the test
 * passes if the child exits with status 0.
 *
 * Exactly one ksft test result (pass/fail/skip) is reported on every path.
 */
static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	/* Pre-fork() preparation that depends on the test variant. */
	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			/* A failed collapse is reported as skip, not as failure. */
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		/*
		 * Child: run child_memcmp_fn() only on the range that was
		 * inherited across fork() and exit with its return value.
		 * NOTE(review): child_memcmp_fn() is defined elsewhere;
		 * presumably it returns 0 when the child's view of the memory
		 * was not changed by the parent -- confirm.
		 */
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	/* Parent: block until one byte arrives on the child_ready pipe. */
	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	/* Post-fork() collapse handling that depends on the test variant. */
	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Already collapsed before fork(); nothing to do. */
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			/* Signal the child via parent_ready and reap it before bailing out. */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			/* Signal the child via parent_ready and reap it before bailing out. */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	/* Tell the child that the modification is done. */
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Translate the wait status: exit code if exited normally, else -EINVAL. */
	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	/* The test passes only if the child exited with status 0. */
	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}
1327 
/* Run the THP collapse test in its ANON_THP_COLLAPSE_UNSHARED variant. */
static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}
1332 
/* Run the THP collapse test in its ANON_THP_COLLAPSE_FULLY_SHARED variant. */
static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}
1337 
/* Run the THP collapse test in its ANON_THP_COLLAPSE_LOWER_SHARED variant. */
static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}
1342 
/* Run the THP collapse test in its ANON_THP_COLLAPSE_UPPER_SHARED variant. */
static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}
1347 
1348 /*
1349  * Test cases that are specific to anonymous THP: pages in private mappings
1350  * that may get shared via COW during fork().
1351  */
1352 static const struct test_case anon_thp_test_cases[] = {
1353 	/*
1354 	 * Basic COW test for fork() without any GUP when collapsing a THP
1355 	 * before fork().
1356 	 *
1357 	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
1358 	 * collapse") might easily get COW handling wrong when not collapsing
1359 	 * exclusivity information properly.
1360 	 */
1361 	{
1362 		"Basic COW after fork() when collapsing before fork()",
1363 		test_anon_thp_collapse_unshared,
1364 	},
1365 	/* Basic COW test, but collapse after COW-sharing a full THP. */
1366 	{
1367 		"Basic COW after fork() when collapsing after fork() (fully shared)",
1368 		test_anon_thp_collapse_fully_shared,
1369 	},
1370 	/*
1371 	 * Basic COW test, but collapse after COW-sharing the lower half of a
1372 	 * THP.
1373 	 */
1374 	{
1375 		"Basic COW after fork() when collapsing after fork() (lower shared)",
1376 		test_anon_thp_collapse_lower_shared,
1377 	},
1378 	/*
1379 	 * Basic COW test, but collapse after COW-sharing the upper half of a
1380 	 * THP.
1381 	 */
1382 	{
1383 		"Basic COW after fork() when collapsing after fork() (upper shared)",
1384 		test_anon_thp_collapse_upper_shared,
1385 	},
1386 };
1387 
1388 static void run_anon_thp_test_cases(void)
1389 {
1390 	int i;
1391 
1392 	if (!thpsize)
1393 		return;
1394 
1395 	ksft_print_msg("[INFO] Anonymous THP tests\n");
1396 
1397 	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
1398 		struct test_case const *test_case = &anon_thp_test_cases[i];
1399 
1400 		ksft_print_msg("[RUN] %s\n", test_case->desc);
1401 		do_run_with_thp(test_case->fn, THP_RUN_PMD);
1402 	}
1403 }
1404 
1405 static int tests_per_anon_thp_test_case(void)
1406 {
1407 	return thpsize ? 1 : 0;
1408 }
1409 
/*
 * Signature of a non-anonymous test function.
 *
 * @mem:  private mapping under test
 * @smem: second, read-only mapping of the same backing
 * @size: size of both mappings in bytes
 */
typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
1411 
/*
 * Basic COW test: writing through the private mapping must not be visible
 * through the other mapping.
 *
 * @mem:  writable private mapping that gets overwritten with 0xff
 * @smem: other mapping whose content must stay unchanged
 * @size: number of bytes to modify and compare
 *
 * Reports exactly one ksft test result; an allocation failure is reported as
 * a test failure.
 */
static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Without the backup buffer we cannot compare anything. */
	if (!old) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}
1427 
/*
 * Non-anonymous R/O pin test: forwards @mem and @size to do_test_ro_pin()
 * with RO_PIN_TEST and the last argument set to false. @smem is unused.
 * NOTE(review): the last argument presumably selects GUP-fast -- confirm
 * against do_test_ro_pin().
 */
static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}
1432 
/*
 * Non-anonymous R/O pin test: forwards @mem and @size to do_test_ro_pin()
 * with RO_PIN_TEST and the last argument set to true. @smem is unused.
 * NOTE(review): the last argument presumably selects GUP-fast -- confirm
 * against do_test_ro_pin().
 */
static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}
1437 
1438 static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
1439 {
1440 	char *mem, *smem, tmp;
1441 
1442 	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
1443 
1444 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
1445 		   MAP_PRIVATE | MAP_ANON, -1, 0);
1446 	if (mem == MAP_FAILED) {
1447 		ksft_test_result_fail("mmap() failed\n");
1448 		return;
1449 	}
1450 
1451 	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
1452 	if (mem == MAP_FAILED) {
1453 		ksft_test_result_fail("mmap() failed\n");
1454 		goto munmap;
1455 	}
1456 
1457 	/* Read from the page to populate the shared zeropage. */
1458 	tmp = *mem + *smem;
1459 	asm volatile("" : "+r" (tmp));
1460 
1461 	fn(mem, smem, pagesize);
1462 munmap:
1463 	munmap(mem, pagesize);
1464 	if (smem != MAP_FAILED)
1465 		munmap(smem, pagesize);
1466 }
1467 
1468 static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
1469 {
1470 	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
1471 	size_t mmap_size;
1472 	int ret;
1473 
1474 	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
1475 
1476 	if (!has_huge_zeropage) {
1477 		ksft_test_result_skip("Huge zeropage not enabled\n");
1478 		return;
1479 	}
1480 
1481 	/* For alignment purposes, we need twice the thp size. */
1482 	mmap_size = 2 * thpsize;
1483 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1484 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1485 	if (mmap_mem == MAP_FAILED) {
1486 		ksft_test_result_fail("mmap() failed\n");
1487 		return;
1488 	}
1489 	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
1490 			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1491 	if (mmap_smem == MAP_FAILED) {
1492 		ksft_test_result_fail("mmap() failed\n");
1493 		goto munmap;
1494 	}
1495 
1496 	/* We need a THP-aligned memory area. */
1497 	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
1498 	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));
1499 
1500 	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
1501 	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
1502 	if (ret) {
1503 		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
1504 		goto munmap;
1505 	}
1506 
1507 	/*
1508 	 * Read from the memory to populate the huge shared zeropage. Read from
1509 	 * the first sub-page and test if we get another sub-page populated
1510 	 * automatically.
1511 	 */
1512 	tmp = *mem + *smem;
1513 	asm volatile("" : "+r" (tmp));
1514 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
1515 	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
1516 		ksft_test_result_skip("Did not get THPs populated\n");
1517 		goto munmap;
1518 	}
1519 
1520 	fn(mem, smem, thpsize);
1521 munmap:
1522 	munmap(mmap_mem, mmap_size);
1523 	if (mmap_smem != MAP_FAILED)
1524 		munmap(mmap_smem, mmap_size);
1525 }
1526 
1527 static void run_with_memfd(non_anon_test_fn fn, const char *desc)
1528 {
1529 	char *mem, *smem, tmp;
1530 	int fd;
1531 
1532 	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
1533 
1534 	fd = memfd_create("test", 0);
1535 	if (fd < 0) {
1536 		ksft_test_result_fail("memfd_create() failed\n");
1537 		return;
1538 	}
1539 
1540 	/* File consists of a single page filled with zeroes. */
1541 	if (fallocate(fd, 0, 0, pagesize)) {
1542 		ksft_test_result_fail("fallocate() failed\n");
1543 		goto close;
1544 	}
1545 
1546 	/* Create a private mapping of the memfd. */
1547 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1548 	if (mem == MAP_FAILED) {
1549 		ksft_test_result_fail("mmap() failed\n");
1550 		goto close;
1551 	}
1552 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1553 	if (mem == MAP_FAILED) {
1554 		ksft_test_result_fail("mmap() failed\n");
1555 		goto munmap;
1556 	}
1557 
1558 	/* Fault the page in. */
1559 	tmp = *mem + *smem;
1560 	asm volatile("" : "+r" (tmp));
1561 
1562 	fn(mem, smem, pagesize);
1563 munmap:
1564 	munmap(mem, pagesize);
1565 	if (smem != MAP_FAILED)
1566 		munmap(smem, pagesize);
1567 close:
1568 	close(fd);
1569 }
1570 
1571 static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
1572 {
1573 	char *mem, *smem, tmp;
1574 	FILE *file;
1575 	int fd;
1576 
1577 	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
1578 
1579 	file = tmpfile();
1580 	if (!file) {
1581 		ksft_test_result_fail("tmpfile() failed\n");
1582 		return;
1583 	}
1584 
1585 	fd = fileno(file);
1586 	if (fd < 0) {
1587 		ksft_test_result_skip("fileno() failed\n");
1588 		return;
1589 	}
1590 
1591 	/* File consists of a single page filled with zeroes. */
1592 	if (fallocate(fd, 0, 0, pagesize)) {
1593 		ksft_test_result_fail("fallocate() failed\n");
1594 		goto close;
1595 	}
1596 
1597 	/* Create a private mapping of the memfd. */
1598 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1599 	if (mem == MAP_FAILED) {
1600 		ksft_test_result_fail("mmap() failed\n");
1601 		goto close;
1602 	}
1603 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1604 	if (mem == MAP_FAILED) {
1605 		ksft_test_result_fail("mmap() failed\n");
1606 		goto munmap;
1607 	}
1608 
1609 	/* Fault the page in. */
1610 	tmp = *mem + *smem;
1611 	asm volatile("" : "+r" (tmp));
1612 
1613 	fn(mem, smem, pagesize);
1614 munmap:
1615 	munmap(mem, pagesize);
1616 	if (smem != MAP_FAILED)
1617 		munmap(smem, pagesize);
1618 close:
1619 	fclose(file);
1620 }
1621 
1622 static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
1623 				   size_t hugetlbsize)
1624 {
1625 	int flags = MFD_HUGETLB;
1626 	char *mem, *smem, tmp;
1627 	int fd;
1628 
1629 	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
1630 		       hugetlbsize / 1024);
1631 
1632 	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
1633 
1634 	fd = memfd_create("test", flags);
1635 	if (fd < 0) {
1636 		ksft_test_result_skip("memfd_create() failed\n");
1637 		return;
1638 	}
1639 
1640 	/* File consists of a single page filled with zeroes. */
1641 	if (fallocate(fd, 0, 0, hugetlbsize)) {
1642 		ksft_test_result_skip("need more free huge pages\n");
1643 		goto close;
1644 	}
1645 
1646 	/* Create a private mapping of the memfd. */
1647 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
1648 		   0);
1649 	if (mem == MAP_FAILED) {
1650 		ksft_test_result_skip("need more free huge pages\n");
1651 		goto close;
1652 	}
1653 	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
1654 	if (mem == MAP_FAILED) {
1655 		ksft_test_result_fail("mmap() failed\n");
1656 		goto munmap;
1657 	}
1658 
1659 	/* Fault the page in. */
1660 	tmp = *mem + *smem;
1661 	asm volatile("" : "+r" (tmp));
1662 
1663 	fn(mem, smem, hugetlbsize);
1664 munmap:
1665 	munmap(mem, hugetlbsize);
1666 	if (mem != MAP_FAILED)
1667 		munmap(smem, hugetlbsize);
1668 close:
1669 	close(fd);
1670 }
1671 
/* Description of a single non-anonymous test case. */
struct non_anon_test_case {
	/* Human-readable name printed in the "[RUN]" line. */
	const char *desc;
	/* Test function invoked by the run_with_*() helpers. */
	non_anon_test_fn fn;
};
1676 
1677 /*
1678  * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
1680  * the shared zeropage(s), pagecache pages, ...
1681  */
1682 static const struct non_anon_test_case non_anon_test_cases[] = {
1683 	/*
1684 	 * Basic COW test without any GUP. If we miss to break COW, changes are
1685 	 * visible via other private/shared mappings.
1686 	 */
1687 	{
1688 		"Basic COW",
1689 		test_cow,
1690 	},
1691 	/*
1692 	 * Take a R/O longterm pin. When modifying the page via the page table,
1693 	 * the page content change must be visible via the pin.
1694 	 */
1695 	{
1696 		"R/O longterm GUP pin",
1697 		test_ro_pin,
1698 	},
1699 	/* Same as above, but using GUP-fast. */
1700 	{
1701 		"R/O longterm GUP-fast pin",
1702 		test_ro_fast_pin,
1703 	},
1704 };
1705 
1706 static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
1707 {
1708 	int i;
1709 
1710 	run_with_zeropage(test_case->fn, test_case->desc);
1711 	run_with_memfd(test_case->fn, test_case->desc);
1712 	run_with_tmpfile(test_case->fn, test_case->desc);
1713 	if (thpsize)
1714 		run_with_huge_zeropage(test_case->fn, test_case->desc);
1715 	for (i = 0; i < nr_hugetlbsizes; i++)
1716 		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
1717 				       hugetlbsizes[i]);
1718 }
1719 
1720 static void run_non_anon_test_cases(void)
1721 {
1722 	int i;
1723 
1724 	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
1725 
1726 	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
1727 		run_non_anon_test_case(&non_anon_test_cases[i]);
1728 }
1729 
1730 static int tests_per_non_anon_test_case(void)
1731 {
1732 	int tests = 3 + nr_hugetlbsizes;
1733 
1734 	if (thpsize)
1735 		tests += 1;
1736 	return tests;
1737 }
1738 
1739 int main(int argc, char **argv)
1740 {
1741 	int err;
1742 
1743 	pagesize = getpagesize();
1744 	detect_thpsize();
1745 	detect_hugetlbsizes();
1746 	detect_huge_zeropage();
1747 
1748 	ksft_print_header();
1749 	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
1750 		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
1751 		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
1752 
1753 	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
1754 	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
1755 	if (pagemap_fd < 0)
1756 		ksft_exit_fail_msg("opening pagemap failed\n");
1757 
1758 	run_anon_test_cases();
1759 	run_anon_thp_test_cases();
1760 	run_non_anon_test_cases();
1761 
1762 	err = ksft_get_fail_cnt();
1763 	if (err)
1764 		ksft_exit_fail_msg("%d out of %d tests failed\n",
1765 				   err, ksft_test_num());
1766 	return ksft_exit_pass();
1767 }
1768