xref: /openbmc/linux/tools/testing/selftests/mm/cow.c (revision d21077fb)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * COW (Copy On Write) tests.
4  *
5  * Copyright 2022, Red Hat, Inc.
6  *
7  * Author(s): David Hildenbrand <david@redhat.com>
8  */
9 #define _GNU_SOURCE
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <fcntl.h>
17 #include <dirent.h>
18 #include <assert.h>
19 #include <sys/mman.h>
20 #include <sys/ioctl.h>
21 #include <sys/wait.h>
22 #include <linux/memfd.h>
23 
24 #include "local_config.h"
25 #ifdef LOCAL_CONFIG_HAVE_LIBURING
26 #include <liburing.h>
27 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
28 
29 #include "../../../../mm/gup_test.h"
30 #include "../kselftest.h"
31 #include "vm_util.h"
32 
33 #ifndef MADV_PAGEOUT
34 #define MADV_PAGEOUT 21
35 #endif
36 #ifndef MADV_COLLAPSE
37 #define MADV_COLLAPSE 25
38 #endif
39 
40 static size_t pagesize;
41 static int pagemap_fd;
42 static size_t thpsize;
43 static int nr_hugetlbsizes;
44 static size_t hugetlbsizes[10];
45 static int gup_fd;
46 static bool has_huge_zeropage;
47 
48 static void detect_huge_zeropage(void)
49 {
50 	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
51 		      O_RDONLY);
52 	size_t enabled = 0;
53 	char buf[15];
54 	int ret;
55 
56 	if (fd < 0)
57 		return;
58 
59 	ret = pread(fd, buf, sizeof(buf), 0);
60 	if (ret > 0 && ret < sizeof(buf)) {
61 		buf[ret] = 0;
62 
63 		enabled = strtoul(buf, NULL, 10);
64 		if (enabled == 1) {
65 			has_huge_zeropage = true;
66 			ksft_print_msg("[INFO] huge zeropage is enabled\n");
67 		}
68 	}
69 
70 	close(fd);
71 }
72 
73 static void detect_hugetlbsizes(void)
74 {
75 	DIR *dir = opendir("/sys/kernel/mm/hugepages/");
76 
77 	if (!dir)
78 		return;
79 
80 	while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) {
81 		struct dirent *entry = readdir(dir);
82 		size_t kb;
83 
84 		if (!entry)
85 			break;
86 		if (entry->d_type != DT_DIR)
87 			continue;
88 		if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1)
89 			continue;
90 		hugetlbsizes[nr_hugetlbsizes] = kb * 1024;
91 		nr_hugetlbsizes++;
92 		ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n",
93 			       kb);
94 	}
95 	closedir(dir);
96 }
97 
98 static bool range_is_swapped(void *addr, size_t size)
99 {
100 	for (; size; addr += pagesize, size -= pagesize)
101 		if (!pagemap_is_swapped(pagemap_fd, addr))
102 			return false;
103 	return true;
104 }
105 
/*
 * Two pipes used for a simple handshake between a parent and its forked
 * child. Index 0 of each pair is the read end, index 1 the write end.
 */
struct comm_pipes {
	/* The child writes a byte here, the parent reads it. */
	int child_ready[2];
	/* The parent writes a byte here, the child reads it. */
	int parent_ready[2];
};
110 
111 static int setup_comm_pipes(struct comm_pipes *comm_pipes)
112 {
113 	if (pipe(comm_pipes->child_ready) < 0)
114 		return -errno;
115 	if (pipe(comm_pipes->parent_ready) < 0) {
116 		close(comm_pipes->child_ready[0]);
117 		close(comm_pipes->child_ready[1]);
118 		return -errno;
119 	}
120 
121 	return 0;
122 }
123 
124 static void close_comm_pipes(struct comm_pipes *comm_pipes)
125 {
126 	close(comm_pipes->child_ready[0]);
127 	close(comm_pipes->child_ready[1]);
128 	close(comm_pipes->parent_ready[0]);
129 	close(comm_pipes->parent_ready[1]);
130 }
131 
132 static int child_memcmp_fn(char *mem, size_t size,
133 			   struct comm_pipes *comm_pipes)
134 {
135 	char *old = malloc(size);
136 	char buf;
137 
138 	/* Backup the original content. */
139 	memcpy(old, mem, size);
140 
141 	/* Wait until the parent modified the page. */
142 	write(comm_pipes->child_ready[1], "0", 1);
143 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
144 		;
145 
146 	/* See if we still read the old values. */
147 	return memcmp(old, mem, size);
148 }
149 
150 static int child_vmsplice_memcmp_fn(char *mem, size_t size,
151 				    struct comm_pipes *comm_pipes)
152 {
153 	struct iovec iov = {
154 		.iov_base = mem,
155 		.iov_len = size,
156 	};
157 	ssize_t cur, total, transferred;
158 	char *old, *new;
159 	int fds[2];
160 	char buf;
161 
162 	old = malloc(size);
163 	new = malloc(size);
164 
165 	/* Backup the original content. */
166 	memcpy(old, mem, size);
167 
168 	if (pipe(fds) < 0)
169 		return -errno;
170 
171 	/* Trigger a read-only pin. */
172 	transferred = vmsplice(fds[1], &iov, 1, 0);
173 	if (transferred < 0)
174 		return -errno;
175 	if (transferred == 0)
176 		return -EINVAL;
177 
178 	/* Unmap it from our page tables. */
179 	if (munmap(mem, size) < 0)
180 		return -errno;
181 
182 	/* Wait until the parent modified it. */
183 	write(comm_pipes->child_ready[1], "0", 1);
184 	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
185 		;
186 
187 	/* See if we still read the old values via the pipe. */
188 	for (total = 0; total < transferred; total += cur) {
189 		cur = read(fds[0], new + total, transferred - total);
190 		if (cur < 0)
191 			return -errno;
192 	}
193 
194 	return memcmp(old, new, transferred);
195 }
196 
/*
 * Function run in the forked child by do_test_cow_in_parent(); its return
 * value is passed to exit().
 */
typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
198 
199 static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
200 				  child_fn fn)
201 {
202 	struct comm_pipes comm_pipes;
203 	char buf;
204 	int ret;
205 
206 	ret = setup_comm_pipes(&comm_pipes);
207 	if (ret) {
208 		ksft_test_result_fail("pipe() failed\n");
209 		return;
210 	}
211 
212 	ret = fork();
213 	if (ret < 0) {
214 		ksft_test_result_fail("fork() failed\n");
215 		goto close_comm_pipes;
216 	} else if (!ret) {
217 		exit(fn(mem, size, &comm_pipes));
218 	}
219 
220 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
221 		;
222 
223 	if (do_mprotect) {
224 		/*
225 		 * mprotect() optimizations might try avoiding
226 		 * write-faults by directly mapping pages writable.
227 		 */
228 		ret = mprotect(mem, size, PROT_READ);
229 		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
230 		if (ret) {
231 			ksft_test_result_fail("mprotect() failed\n");
232 			write(comm_pipes.parent_ready[1], "0", 1);
233 			wait(&ret);
234 			goto close_comm_pipes;
235 		}
236 	}
237 
238 	/* Modify the page. */
239 	memset(mem, 0xff, size);
240 	write(comm_pipes.parent_ready[1], "0", 1);
241 
242 	wait(&ret);
243 	if (WIFEXITED(ret))
244 		ret = WEXITSTATUS(ret);
245 	else
246 		ret = -EINVAL;
247 
248 	ksft_test_result(!ret, "No leak from parent into child\n");
249 close_comm_pipes:
250 	close_comm_pipes(&comm_pipes);
251 }
252 
253 static void test_cow_in_parent(char *mem, size_t size)
254 {
255 	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
256 }
257 
258 static void test_cow_in_parent_mprotect(char *mem, size_t size)
259 {
260 	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
261 }
262 
263 static void test_vmsplice_in_child(char *mem, size_t size)
264 {
265 	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
266 }
267 
268 static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
269 {
270 	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
271 }
272 
273 static void do_test_vmsplice_in_parent(char *mem, size_t size,
274 				       bool before_fork)
275 {
276 	struct iovec iov = {
277 		.iov_base = mem,
278 		.iov_len = size,
279 	};
280 	ssize_t cur, total, transferred;
281 	struct comm_pipes comm_pipes;
282 	char *old, *new;
283 	int ret, fds[2];
284 	char buf;
285 
286 	old = malloc(size);
287 	new = malloc(size);
288 
289 	memcpy(old, mem, size);
290 
291 	ret = setup_comm_pipes(&comm_pipes);
292 	if (ret) {
293 		ksft_test_result_fail("pipe() failed\n");
294 		goto free;
295 	}
296 
297 	if (pipe(fds) < 0) {
298 		ksft_test_result_fail("pipe() failed\n");
299 		goto close_comm_pipes;
300 	}
301 
302 	if (before_fork) {
303 		transferred = vmsplice(fds[1], &iov, 1, 0);
304 		if (transferred <= 0) {
305 			ksft_test_result_fail("vmsplice() failed\n");
306 			goto close_pipe;
307 		}
308 	}
309 
310 	ret = fork();
311 	if (ret < 0) {
312 		ksft_test_result_fail("fork() failed\n");
313 		goto close_pipe;
314 	} else if (!ret) {
315 		write(comm_pipes.child_ready[1], "0", 1);
316 		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
317 			;
318 		/* Modify page content in the child. */
319 		memset(mem, 0xff, size);
320 		exit(0);
321 	}
322 
323 	if (!before_fork) {
324 		transferred = vmsplice(fds[1], &iov, 1, 0);
325 		if (transferred <= 0) {
326 			ksft_test_result_fail("vmsplice() failed\n");
327 			wait(&ret);
328 			goto close_pipe;
329 		}
330 	}
331 
332 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
333 		;
334 	if (munmap(mem, size) < 0) {
335 		ksft_test_result_fail("munmap() failed\n");
336 		goto close_pipe;
337 	}
338 	write(comm_pipes.parent_ready[1], "0", 1);
339 
340 	/* Wait until the child is done writing. */
341 	wait(&ret);
342 	if (!WIFEXITED(ret)) {
343 		ksft_test_result_fail("wait() failed\n");
344 		goto close_pipe;
345 	}
346 
347 	/* See if we still read the old values. */
348 	for (total = 0; total < transferred; total += cur) {
349 		cur = read(fds[0], new + total, transferred - total);
350 		if (cur < 0) {
351 			ksft_test_result_fail("read() failed\n");
352 			goto close_pipe;
353 		}
354 	}
355 
356 	ksft_test_result(!memcmp(old, new, transferred),
357 			 "No leak from child into parent\n");
358 close_pipe:
359 	close(fds[0]);
360 	close(fds[1]);
361 close_comm_pipes:
362 	close_comm_pipes(&comm_pipes);
363 free:
364 	free(old);
365 	free(new);
366 }
367 
/* Parent-side vmsplice() test with the vmsplice() issued before fork(). */
static void test_vmsplice_before_fork(char *mem, size_t size)
{
	const bool before_fork = true;

	do_test_vmsplice_in_parent(mem, size, before_fork);
}
372 
/* Parent-side vmsplice() test with the vmsplice() issued after fork(). */
static void test_vmsplice_after_fork(char *mem, size_t size)
{
	const bool before_fork = false;

	do_test_vmsplice_in_parent(mem, size, before_fork);
}
377 
378 #ifdef LOCAL_CONFIG_HAVE_LIBURING
379 static void do_test_iouring(char *mem, size_t size, bool use_fork)
380 {
381 	struct comm_pipes comm_pipes;
382 	struct io_uring_cqe *cqe;
383 	struct io_uring_sqe *sqe;
384 	struct io_uring ring;
385 	ssize_t cur, total;
386 	struct iovec iov;
387 	char *buf, *tmp;
388 	int ret, fd;
389 	FILE *file;
390 
391 	ret = setup_comm_pipes(&comm_pipes);
392 	if (ret) {
393 		ksft_test_result_fail("pipe() failed\n");
394 		return;
395 	}
396 
397 	file = tmpfile();
398 	if (!file) {
399 		ksft_test_result_fail("tmpfile() failed\n");
400 		goto close_comm_pipes;
401 	}
402 	fd = fileno(file);
403 	assert(fd);
404 
405 	tmp = malloc(size);
406 	if (!tmp) {
407 		ksft_test_result_fail("malloc() failed\n");
408 		goto close_file;
409 	}
410 
411 	/* Skip on errors, as we might just lack kernel support. */
412 	ret = io_uring_queue_init(1, &ring, 0);
413 	if (ret < 0) {
414 		ksft_test_result_skip("io_uring_queue_init() failed\n");
415 		goto free_tmp;
416 	}
417 
418 	/*
419 	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
420 	 * | FOLL_LONGTERM the range.
421 	 *
422 	 * Skip on errors, as we might just lack kernel support or might not
423 	 * have sufficient MEMLOCK permissions.
424 	 */
425 	iov.iov_base = mem;
426 	iov.iov_len = size;
427 	ret = io_uring_register_buffers(&ring, &iov, 1);
428 	if (ret) {
429 		ksft_test_result_skip("io_uring_register_buffers() failed\n");
430 		goto queue_exit;
431 	}
432 
433 	if (use_fork) {
434 		/*
435 		 * fork() and keep the child alive until we're done. Note that
436 		 * we expect the pinned page to not get shared with the child.
437 		 */
438 		ret = fork();
439 		if (ret < 0) {
440 			ksft_test_result_fail("fork() failed\n");
441 			goto unregister_buffers;
442 		} else if (!ret) {
443 			write(comm_pipes.child_ready[1], "0", 1);
444 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
445 				;
446 			exit(0);
447 		}
448 
449 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
450 			;
451 	} else {
452 		/*
453 		 * Map the page R/O into the page table. Enable softdirty
454 		 * tracking to stop the page from getting mapped R/W immediately
455 		 * again by mprotect() optimizations. Note that we don't have an
456 		 * easy way to test if that worked (the pagemap does not export
457 		 * if the page is mapped R/O vs. R/W).
458 		 */
459 		ret = mprotect(mem, size, PROT_READ);
460 		clear_softdirty();
461 		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
462 		if (ret) {
463 			ksft_test_result_fail("mprotect() failed\n");
464 			goto unregister_buffers;
465 		}
466 	}
467 
468 	/*
469 	 * Modify the page and write page content as observed by the fixed
470 	 * buffer pin to the file so we can verify it.
471 	 */
472 	memset(mem, 0xff, size);
473 	sqe = io_uring_get_sqe(&ring);
474 	if (!sqe) {
475 		ksft_test_result_fail("io_uring_get_sqe() failed\n");
476 		goto quit_child;
477 	}
478 	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
479 
480 	ret = io_uring_submit(&ring);
481 	if (ret < 0) {
482 		ksft_test_result_fail("io_uring_submit() failed\n");
483 		goto quit_child;
484 	}
485 
486 	ret = io_uring_wait_cqe(&ring, &cqe);
487 	if (ret < 0) {
488 		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
489 		goto quit_child;
490 	}
491 
492 	if (cqe->res != size) {
493 		ksft_test_result_fail("write_fixed failed\n");
494 		goto quit_child;
495 	}
496 	io_uring_cqe_seen(&ring, cqe);
497 
498 	/* Read back the file content to the temporary buffer. */
499 	total = 0;
500 	while (total < size) {
501 		cur = pread(fd, tmp + total, size - total, total);
502 		if (cur < 0) {
503 			ksft_test_result_fail("pread() failed\n");
504 			goto quit_child;
505 		}
506 		total += cur;
507 	}
508 
509 	/* Finally, check if we read what we expected. */
510 	ksft_test_result(!memcmp(mem, tmp, size),
511 			 "Longterm R/W pin is reliable\n");
512 
513 quit_child:
514 	if (use_fork) {
515 		write(comm_pipes.parent_ready[1], "0", 1);
516 		wait(&ret);
517 	}
518 unregister_buffers:
519 	io_uring_unregister_buffers(&ring);
520 queue_exit:
521 	io_uring_queue_exit(&ring);
522 free_tmp:
523 	free(tmp);
524 close_file:
525 	fclose(file);
526 close_comm_pipes:
527 	close_comm_pipes(&comm_pipes);
528 }
529 
/* io_uring fixed-buffer test using the R/O toggle instead of fork(). */
static void test_iouring_ro(char *mem, size_t size)
{
	const bool use_fork = false;

	do_test_iouring(mem, size, use_fork);
}
534 
/* io_uring fixed-buffer test that fork()s after registering the buffer. */
static void test_iouring_fork(char *mem, size_t size)
{
	const bool use_fork = true;

	do_test_iouring(mem, size, use_fork);
}
539 
540 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
541 
/* Selects how do_test_ro_pin() prepares the range before pinning it. */
enum ro_pin_test {
	/* No preparation. */
	RO_PIN_TEST = 0,
	/* A forked child stays alive while the pin is taken. */
	RO_PIN_TEST_SHARED = 1,
	/* A forked child exits before the pin is taken. */
	RO_PIN_TEST_PREVIOUSLY_SHARED = 2,
	/* The range is toggled to PROT_READ and back before pinning. */
	RO_PIN_TEST_RO_EXCLUSIVE = 3,
};
548 
549 static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
550 			   bool fast)
551 {
552 	struct pin_longterm_test args;
553 	struct comm_pipes comm_pipes;
554 	char *tmp, buf;
555 	__u64 tmp_val;
556 	int ret;
557 
558 	if (gup_fd < 0) {
559 		ksft_test_result_skip("gup_test not available\n");
560 		return;
561 	}
562 
563 	tmp = malloc(size);
564 	if (!tmp) {
565 		ksft_test_result_fail("malloc() failed\n");
566 		return;
567 	}
568 
569 	ret = setup_comm_pipes(&comm_pipes);
570 	if (ret) {
571 		ksft_test_result_fail("pipe() failed\n");
572 		goto free_tmp;
573 	}
574 
575 	switch (test) {
576 	case RO_PIN_TEST:
577 		break;
578 	case RO_PIN_TEST_SHARED:
579 	case RO_PIN_TEST_PREVIOUSLY_SHARED:
580 		/*
581 		 * Share the pages with our child. As the pages are not pinned,
582 		 * this should just work.
583 		 */
584 		ret = fork();
585 		if (ret < 0) {
586 			ksft_test_result_fail("fork() failed\n");
587 			goto close_comm_pipes;
588 		} else if (!ret) {
589 			write(comm_pipes.child_ready[1], "0", 1);
590 			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
591 				;
592 			exit(0);
593 		}
594 
595 		/* Wait until our child is ready. */
596 		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
597 			;
598 
599 		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
600 			/*
601 			 * Tell the child to quit now and wait until it quit.
602 			 * The pages should now be mapped R/O into our page
603 			 * tables, but they are no longer shared.
604 			 */
605 			write(comm_pipes.parent_ready[1], "0", 1);
606 			wait(&ret);
607 			if (!WIFEXITED(ret))
608 				ksft_print_msg("[INFO] wait() failed\n");
609 		}
610 		break;
611 	case RO_PIN_TEST_RO_EXCLUSIVE:
612 		/*
613 		 * Map the page R/O into the page table. Enable softdirty
614 		 * tracking to stop the page from getting mapped R/W immediately
615 		 * again by mprotect() optimizations. Note that we don't have an
616 		 * easy way to test if that worked (the pagemap does not export
617 		 * if the page is mapped R/O vs. R/W).
618 		 */
619 		ret = mprotect(mem, size, PROT_READ);
620 		clear_softdirty();
621 		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
622 		if (ret) {
623 			ksft_test_result_fail("mprotect() failed\n");
624 			goto close_comm_pipes;
625 		}
626 		break;
627 	default:
628 		assert(false);
629 	}
630 
631 	/* Take a R/O pin. This should trigger unsharing. */
632 	args.addr = (__u64)(uintptr_t)mem;
633 	args.size = size;
634 	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
635 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
636 	if (ret) {
637 		if (errno == EINVAL)
638 			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
639 		else
640 			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
641 		goto wait;
642 	}
643 
644 	/* Modify the page. */
645 	memset(mem, 0xff, size);
646 
647 	/*
648 	 * Read back the content via the pin to the temporary buffer and
649 	 * test if we observed the modification.
650 	 */
651 	tmp_val = (__u64)(uintptr_t)tmp;
652 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
653 	if (ret)
654 		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
655 	else
656 		ksft_test_result(!memcmp(mem, tmp, size),
657 				 "Longterm R/O pin is reliable\n");
658 
659 	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
660 	if (ret)
661 		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
662 wait:
663 	switch (test) {
664 	case RO_PIN_TEST_SHARED:
665 		write(comm_pipes.parent_ready[1], "0", 1);
666 		wait(&ret);
667 		if (!WIFEXITED(ret))
668 			ksft_print_msg("[INFO] wait() failed\n");
669 		break;
670 	default:
671 		break;
672 	}
673 close_comm_pipes:
674 	close_comm_pipes(&comm_pipes);
675 free_tmp:
676 	free(tmp);
677 }
678 
679 static void test_ro_pin_on_shared(char *mem, size_t size)
680 {
681 	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
682 }
683 
684 static void test_ro_fast_pin_on_shared(char *mem, size_t size)
685 {
686 	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
687 }
688 
689 static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
690 {
691 	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
692 }
693 
694 static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
695 {
696 	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
697 }
698 
699 static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
700 {
701 	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
702 }
703 
704 static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
705 {
706 	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
707 }
708 
/* A test body that operates on the prepared range [mem, mem + size). */
typedef void (*test_fn)(char *mem, size_t size);
710 
711 static void do_run_with_base_page(test_fn fn, bool swapout)
712 {
713 	char *mem;
714 	int ret;
715 
716 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
717 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
718 	if (mem == MAP_FAILED) {
719 		ksft_test_result_fail("mmap() failed\n");
720 		return;
721 	}
722 
723 	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
724 	/* Ignore if not around on a kernel. */
725 	if (ret && errno != EINVAL) {
726 		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
727 		goto munmap;
728 	}
729 
730 	/* Populate a base page. */
731 	memset(mem, 0, pagesize);
732 
733 	if (swapout) {
734 		madvise(mem, pagesize, MADV_PAGEOUT);
735 		if (!pagemap_is_swapped(pagemap_fd, mem)) {
736 			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
737 			goto munmap;
738 		}
739 	}
740 
741 	fn(mem, pagesize);
742 munmap:
743 	munmap(mem, pagesize);
744 }
745 
746 static void run_with_base_page(test_fn fn, const char *desc)
747 {
748 	ksft_print_msg("[RUN] %s ... with base page\n", desc);
749 	do_run_with_base_page(fn, false);
750 }
751 
752 static void run_with_base_page_swap(test_fn fn, const char *desc)
753 {
754 	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
755 	do_run_with_base_page(fn, true);
756 }
757 
/* Selects how do_run_with_thp() prepares the THP before running a test. */
enum thp_run {
	/* Use the THP as populated. */
	THP_RUN_PMD = 0,
	/* As above, then page it out. */
	THP_RUN_PMD_SWAPOUT = 1,
	/* Toggle protection of one subpage first. */
	THP_RUN_PTE = 2,
	/* As above, then page it out. */
	THP_RUN_PTE_SWAPOUT = 3,
	/* Discard all but the first subpage. */
	THP_RUN_SINGLE_PTE = 4,
	/* As above, then page it out. */
	THP_RUN_SINGLE_PTE_SWAPOUT = 5,
	/* mremap() the upper half of the THP elsewhere. */
	THP_RUN_PARTIAL_MREMAP = 6,
	/* Share only the first subpage with a short-lived child. */
	THP_RUN_PARTIAL_SHARED = 7,
};
768 
769 static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
770 {
771 	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
772 	size_t size, mmap_size, mremap_size;
773 	int ret;
774 
775 	/* For alignment purposes, we need twice the thp size. */
776 	mmap_size = 2 * thpsize;
777 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
778 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
779 	if (mmap_mem == MAP_FAILED) {
780 		ksft_test_result_fail("mmap() failed\n");
781 		return;
782 	}
783 
784 	/* We need a THP-aligned memory area. */
785 	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
786 
787 	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
788 	if (ret) {
789 		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
790 		goto munmap;
791 	}
792 
793 	/*
794 	 * Try to populate a THP. Touch the first sub-page and test if we get
795 	 * another sub-page populated automatically.
796 	 */
797 	mem[0] = 0;
798 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
799 		ksft_test_result_skip("Did not get a THP populated\n");
800 		goto munmap;
801 	}
802 	memset(mem, 0, thpsize);
803 
804 	size = thpsize;
805 	switch (thp_run) {
806 	case THP_RUN_PMD:
807 	case THP_RUN_PMD_SWAPOUT:
808 		break;
809 	case THP_RUN_PTE:
810 	case THP_RUN_PTE_SWAPOUT:
811 		/*
812 		 * Trigger PTE-mapping the THP by temporarily mapping a single
813 		 * subpage R/O.
814 		 */
815 		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
816 		if (ret) {
817 			ksft_test_result_fail("mprotect() failed\n");
818 			goto munmap;
819 		}
820 		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
821 		if (ret) {
822 			ksft_test_result_fail("mprotect() failed\n");
823 			goto munmap;
824 		}
825 		break;
826 	case THP_RUN_SINGLE_PTE:
827 	case THP_RUN_SINGLE_PTE_SWAPOUT:
828 		/*
829 		 * Discard all but a single subpage of that PTE-mapped THP. What
830 		 * remains is a single PTE mapping a single subpage.
831 		 */
832 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
833 		if (ret) {
834 			ksft_test_result_fail("MADV_DONTNEED failed\n");
835 			goto munmap;
836 		}
837 		size = pagesize;
838 		break;
839 	case THP_RUN_PARTIAL_MREMAP:
840 		/*
841 		 * Remap half of the THP. We need some new memory location
842 		 * for that.
843 		 */
844 		mremap_size = thpsize / 2;
845 		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
846 				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
847 		if (mem == MAP_FAILED) {
848 			ksft_test_result_fail("mmap() failed\n");
849 			goto munmap;
850 		}
851 		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
852 			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
853 		if (tmp != mremap_mem) {
854 			ksft_test_result_fail("mremap() failed\n");
855 			goto munmap;
856 		}
857 		size = mremap_size;
858 		break;
859 	case THP_RUN_PARTIAL_SHARED:
860 		/*
861 		 * Share the first page of the THP with a child and quit the
862 		 * child. This will result in some parts of the THP never
863 		 * have been shared.
864 		 */
865 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
866 		if (ret) {
867 			ksft_test_result_fail("MADV_DONTFORK failed\n");
868 			goto munmap;
869 		}
870 		ret = fork();
871 		if (ret < 0) {
872 			ksft_test_result_fail("fork() failed\n");
873 			goto munmap;
874 		} else if (!ret) {
875 			exit(0);
876 		}
877 		wait(&ret);
878 		/* Allow for sharing all pages again. */
879 		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
880 		if (ret) {
881 			ksft_test_result_fail("MADV_DOFORK failed\n");
882 			goto munmap;
883 		}
884 		break;
885 	default:
886 		assert(false);
887 	}
888 
889 	switch (thp_run) {
890 	case THP_RUN_PMD_SWAPOUT:
891 	case THP_RUN_PTE_SWAPOUT:
892 	case THP_RUN_SINGLE_PTE_SWAPOUT:
893 		madvise(mem, size, MADV_PAGEOUT);
894 		if (!range_is_swapped(mem, size)) {
895 			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
896 			goto munmap;
897 		}
898 		break;
899 	default:
900 		break;
901 	}
902 
903 	fn(mem, size);
904 munmap:
905 	munmap(mmap_mem, mmap_size);
906 	if (mremap_mem != MAP_FAILED)
907 		munmap(mremap_mem, mremap_size);
908 }
909 
910 static void run_with_thp(test_fn fn, const char *desc)
911 {
912 	ksft_print_msg("[RUN] %s ... with THP\n", desc);
913 	do_run_with_thp(fn, THP_RUN_PMD);
914 }
915 
916 static void run_with_thp_swap(test_fn fn, const char *desc)
917 {
918 	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
919 	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
920 }
921 
922 static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
923 {
924 	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
925 	do_run_with_thp(fn, THP_RUN_PTE);
926 }
927 
928 static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
929 {
930 	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
931 	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
932 }
933 
934 static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
935 {
936 	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
937 	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
938 }
939 
940 static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
941 {
942 	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
943 	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
944 }
945 
946 static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
947 {
948 	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
949 	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
950 }
951 
952 static void run_with_partial_shared_thp(test_fn fn, const char *desc)
953 {
954 	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
955 	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
956 }
957 
958 static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
959 {
960 	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
961 	char *mem, *dummy;
962 
963 	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
964 		       hugetlbsize / 1024);
965 
966 	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
967 
968 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
969 	if (mem == MAP_FAILED) {
970 		ksft_test_result_skip("need more free huge pages\n");
971 		return;
972 	}
973 
974 	/* Populate an huge page. */
975 	memset(mem, 0, hugetlbsize);
976 
977 	/*
978 	 * We need a total of two hugetlb pages to handle COW/unsharing
979 	 * properly, otherwise we might get zapped by a SIGBUS.
980 	 */
981 	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
982 	if (dummy == MAP_FAILED) {
983 		ksft_test_result_skip("need more free huge pages\n");
984 		goto munmap;
985 	}
986 	munmap(dummy, hugetlbsize);
987 
988 	fn(mem, hugetlbsize);
989 munmap:
990 	munmap(mem, hugetlbsize);
991 }
992 
993 struct test_case {
994 	const char *desc;
995 	test_fn fn;
996 };
997 
998 /*
999  * Test cases that are specific to anonymous pages: pages in private mappings
1000  * that may get shared via COW during fork().
1001  */
1002 static const struct test_case anon_test_cases[] = {
1003 	/*
1004 	 * Basic COW tests for fork() without any GUP. If we miss to break COW,
1005 	 * either the child can observe modifications by the parent or the
1006 	 * other way around.
1007 	 */
1008 	{
1009 		"Basic COW after fork()",
1010 		test_cow_in_parent,
1011 	},
1012 	/*
1013 	 * Basic test, but do an additional mprotect(PROT_READ)+
1014 	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1015 	 */
1016 	{
1017 		"Basic COW after fork() with mprotect() optimization",
1018 		test_cow_in_parent_mprotect,
1019 	},
1020 	/*
1021 	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
1022 	 * we miss to break COW, the child observes modifications by the parent.
1023 	 * This is CVE-2020-29374 reported by Jann Horn.
1024 	 */
1025 	{
1026 		"vmsplice() + unmap in child",
1027 		test_vmsplice_in_child
1028 	},
1029 	/*
1030 	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
1031 	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
1032 	 */
1033 	{
1034 		"vmsplice() + unmap in child with mprotect() optimization",
1035 		test_vmsplice_in_child_mprotect
1036 	},
1037 	/*
1038 	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
1039 	 * fork(); modify in the child. If we miss to break COW, the parent
1040 	 * observes modifications by the child.
1041 	 */
1042 	{
1043 		"vmsplice() before fork(), unmap in parent after fork()",
1044 		test_vmsplice_before_fork,
1045 	},
1046 	/*
1047 	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
1048 	 * child. If we miss to break COW, the parent observes modifications by
1049 	 * the child.
1050 	 */
1051 	{
1052 		"vmsplice() + unmap in parent after fork()",
1053 		test_vmsplice_after_fork,
1054 	},
1055 #ifdef LOCAL_CONFIG_HAVE_LIBURING
1056 	/*
1057 	 * Take a R/W longterm pin and then map the page R/O into the page
1058 	 * table to trigger a write fault on next access. When modifying the
1059 	 * page, the page content must be visible via the pin.
1060 	 */
1061 	{
1062 		"R/O-mapping a page registered as iouring fixed buffer",
1063 		test_iouring_ro,
1064 	},
1065 	/*
1066 	 * Take a R/W longterm pin and then fork() a child. When modifying the
1067 	 * page, the page content must be visible via the pin. We expect the
1068 	 * pinned page to not get shared with the child.
1069 	 */
1070 	{
1071 		"fork() with an iouring fixed buffer",
1072 		test_iouring_fork,
1073 	},
1074 
1075 #endif /* LOCAL_CONFIG_HAVE_LIBURING */
1076 	/*
1077 	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
1078 	 * When modifying the page via the page table, the page content change
1079 	 * must be visible via the pin.
1080 	 */
1081 	{
1082 		"R/O GUP pin on R/O-mapped shared page",
1083 		test_ro_pin_on_shared,
1084 	},
1085 	/* Same as above, but using GUP-fast. */
1086 	{
1087 		"R/O GUP-fast pin on R/O-mapped shared page",
1088 		test_ro_fast_pin_on_shared,
1089 	},
1090 	/*
1091 	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
1092 	 * was previously shared. When modifying the page via the page table,
1093 	 * the page content change must be visible via the pin.
1094 	 */
1095 	{
1096 		"R/O GUP pin on R/O-mapped previously-shared page",
1097 		test_ro_pin_on_ro_previously_shared,
1098 	},
1099 	/* Same as above, but using GUP-fast. */
1100 	{
1101 		"R/O GUP-fast pin on R/O-mapped previously-shared page",
1102 		test_ro_fast_pin_on_ro_previously_shared,
1103 	},
1104 	/*
1105 	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
1106 	 * When modifying the page via the page table, the page content change
1107 	 * must be visible via the pin.
1108 	 */
1109 	{
1110 		"R/O GUP pin on R/O-mapped exclusive page",
1111 		test_ro_pin_on_ro_exclusive,
1112 	},
1113 	/* Same as above, but using GUP-fast. */
1114 	{
1115 		"R/O GUP-fast pin on R/O-mapped exclusive page",
1116 		test_ro_fast_pin_on_ro_exclusive,
1117 	},
1118 };
1119 
1120 static void run_anon_test_case(struct test_case const *test_case)
1121 {
1122 	int i;
1123 
1124 	run_with_base_page(test_case->fn, test_case->desc);
1125 	run_with_base_page_swap(test_case->fn, test_case->desc);
1126 	if (thpsize) {
1127 		run_with_thp(test_case->fn, test_case->desc);
1128 		run_with_thp_swap(test_case->fn, test_case->desc);
1129 		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
1130 		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
1131 		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
1132 		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
1133 		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
1134 		run_with_partial_shared_thp(test_case->fn, test_case->desc);
1135 	}
1136 	for (i = 0; i < nr_hugetlbsizes; i++)
1137 		run_with_hugetlb(test_case->fn, test_case->desc,
1138 				 hugetlbsizes[i]);
1139 }
1140 
1141 static void run_anon_test_cases(void)
1142 {
1143 	int i;
1144 
1145 	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
1146 
1147 	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
1148 		run_anon_test_case(&anon_test_cases[i]);
1149 }
1150 
1151 static int tests_per_anon_test_case(void)
1152 {
1153 	int tests = 2 + nr_hugetlbsizes;
1154 
1155 	if (thpsize)
1156 		tests += 8;
1157 	return tests;
1158 }
1159 
/* Variants of the THP collapse test run by do_test_anon_thp_collapse(). */
enum anon_thp_collapse_test {
	/* Collapse before fork(), i.e. before COW-sharing the THP. */
	ANON_THP_COLLAPSE_UNSHARED = 0,
	/* Collapse after fork() with the whole THP COW-shared. */
	ANON_THP_COLLAPSE_FULLY_SHARED = 1,
	/* Collapse after fork() with only the lower half COW-shared. */
	ANON_THP_COLLAPSE_LOWER_SHARED = 2,
	/* Collapse after fork() with only the upper half COW-shared. */
	ANON_THP_COLLAPSE_UPPER_SHARED = 3,
};
1166 
1167 static void do_test_anon_thp_collapse(char *mem, size_t size,
1168 				      enum anon_thp_collapse_test test)
1169 {
1170 	struct comm_pipes comm_pipes;
1171 	char buf;
1172 	int ret;
1173 
1174 	ret = setup_comm_pipes(&comm_pipes);
1175 	if (ret) {
1176 		ksft_test_result_fail("pipe() failed\n");
1177 		return;
1178 	}
1179 
1180 	/*
1181 	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
1182 	 * R/O, such that we can try collapsing it later.
1183 	 */
1184 	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
1185 	if (ret) {
1186 		ksft_test_result_fail("mprotect() failed\n");
1187 		goto close_comm_pipes;
1188 	}
1189 	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
1190 	if (ret) {
1191 		ksft_test_result_fail("mprotect() failed\n");
1192 		goto close_comm_pipes;
1193 	}
1194 
1195 	switch (test) {
1196 	case ANON_THP_COLLAPSE_UNSHARED:
1197 		/* Collapse before actually COW-sharing the page. */
1198 		ret = madvise(mem, size, MADV_COLLAPSE);
1199 		if (ret) {
1200 			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
1201 					      strerror(errno));
1202 			goto close_comm_pipes;
1203 		}
1204 		break;
1205 	case ANON_THP_COLLAPSE_FULLY_SHARED:
1206 		/* COW-share the full PTE-mapped THP. */
1207 		break;
1208 	case ANON_THP_COLLAPSE_LOWER_SHARED:
1209 		/* Don't COW-share the upper part of the THP. */
1210 		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
1211 		if (ret) {
1212 			ksft_test_result_fail("MADV_DONTFORK failed\n");
1213 			goto close_comm_pipes;
1214 		}
1215 		break;
1216 	case ANON_THP_COLLAPSE_UPPER_SHARED:
1217 		/* Don't COW-share the lower part of the THP. */
1218 		ret = madvise(mem, size / 2, MADV_DONTFORK);
1219 		if (ret) {
1220 			ksft_test_result_fail("MADV_DONTFORK failed\n");
1221 			goto close_comm_pipes;
1222 		}
1223 		break;
1224 	default:
1225 		assert(false);
1226 	}
1227 
1228 	ret = fork();
1229 	if (ret < 0) {
1230 		ksft_test_result_fail("fork() failed\n");
1231 		goto close_comm_pipes;
1232 	} else if (!ret) {
1233 		switch (test) {
1234 		case ANON_THP_COLLAPSE_UNSHARED:
1235 		case ANON_THP_COLLAPSE_FULLY_SHARED:
1236 			exit(child_memcmp_fn(mem, size, &comm_pipes));
1237 			break;
1238 		case ANON_THP_COLLAPSE_LOWER_SHARED:
1239 			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
1240 			break;
1241 		case ANON_THP_COLLAPSE_UPPER_SHARED:
1242 			exit(child_memcmp_fn(mem + size / 2, size / 2,
1243 					     &comm_pipes));
1244 			break;
1245 		default:
1246 			assert(false);
1247 		}
1248 	}
1249 
1250 	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
1251 		;
1252 
1253 	switch (test) {
1254 	case ANON_THP_COLLAPSE_UNSHARED:
1255 		break;
1256 	case ANON_THP_COLLAPSE_UPPER_SHARED:
1257 	case ANON_THP_COLLAPSE_LOWER_SHARED:
1258 		/*
1259 		 * Revert MADV_DONTFORK such that we merge the VMAs and are
1260 		 * able to actually collapse.
1261 		 */
1262 		ret = madvise(mem, size, MADV_DOFORK);
1263 		if (ret) {
1264 			ksft_test_result_fail("MADV_DOFORK failed\n");
1265 			write(comm_pipes.parent_ready[1], "0", 1);
1266 			wait(&ret);
1267 			goto close_comm_pipes;
1268 		}
1269 		/* FALLTHROUGH */
1270 	case ANON_THP_COLLAPSE_FULLY_SHARED:
1271 		/* Collapse before anyone modified the COW-shared page. */
1272 		ret = madvise(mem, size, MADV_COLLAPSE);
1273 		if (ret) {
1274 			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
1275 					      strerror(errno));
1276 			write(comm_pipes.parent_ready[1], "0", 1);
1277 			wait(&ret);
1278 			goto close_comm_pipes;
1279 		}
1280 		break;
1281 	default:
1282 		assert(false);
1283 	}
1284 
1285 	/* Modify the page. */
1286 	memset(mem, 0xff, size);
1287 	write(comm_pipes.parent_ready[1], "0", 1);
1288 
1289 	wait(&ret);
1290 	if (WIFEXITED(ret))
1291 		ret = WEXITSTATUS(ret);
1292 	else
1293 		ret = -EINVAL;
1294 
1295 	ksft_test_result(!ret, "No leak from parent into child\n");
1296 close_comm_pipes:
1297 	close_comm_pipes(&comm_pipes);
1298 }
1299 
1300 static void test_anon_thp_collapse_unshared(char *mem, size_t size)
1301 {
1302 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
1303 }
1304 
1305 static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
1306 {
1307 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
1308 }
1309 
1310 static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
1311 {
1312 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
1313 }
1314 
1315 static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
1316 {
1317 	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
1318 }
1319 
1320 /*
1321  * Test cases that are specific to anonymous THP: pages in private mappings
1322  * that may get shared via COW during fork().
1323  */
1324 static const struct test_case anon_thp_test_cases[] = {
1325 	/*
1326 	 * Basic COW test for fork() without any GUP when collapsing a THP
1327 	 * before fork().
1328 	 *
1329 	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
1330 	 * collapse") might easily get COW handling wrong when not collapsing
1331 	 * exclusivity information properly.
1332 	 */
1333 	{
1334 		"Basic COW after fork() when collapsing before fork()",
1335 		test_anon_thp_collapse_unshared,
1336 	},
1337 	/* Basic COW test, but collapse after COW-sharing a full THP. */
1338 	{
1339 		"Basic COW after fork() when collapsing after fork() (fully shared)",
1340 		test_anon_thp_collapse_fully_shared,
1341 	},
1342 	/*
1343 	 * Basic COW test, but collapse after COW-sharing the lower half of a
1344 	 * THP.
1345 	 */
1346 	{
1347 		"Basic COW after fork() when collapsing after fork() (lower shared)",
1348 		test_anon_thp_collapse_lower_shared,
1349 	},
1350 	/*
1351 	 * Basic COW test, but collapse after COW-sharing the upper half of a
1352 	 * THP.
1353 	 */
1354 	{
1355 		"Basic COW after fork() when collapsing after fork() (upper shared)",
1356 		test_anon_thp_collapse_upper_shared,
1357 	},
1358 };
1359 
1360 static void run_anon_thp_test_cases(void)
1361 {
1362 	int i;
1363 
1364 	if (!thpsize)
1365 		return;
1366 
1367 	ksft_print_msg("[INFO] Anonymous THP tests\n");
1368 
1369 	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
1370 		struct test_case const *test_case = &anon_thp_test_cases[i];
1371 
1372 		ksft_print_msg("[RUN] %s\n", test_case->desc);
1373 		do_run_with_thp(test_case->fn, THP_RUN_PMD);
1374 	}
1375 }
1376 
1377 static int tests_per_anon_thp_test_case(void)
1378 {
1379 	return thpsize ? 1 : 0;
1380 }
1381 
/*
 * Signature of a non-anonymous test: @mem is the private mapping under test,
 * @smem a second, read-only mapping backed by the same page(s), and @size
 * the size of each mapping.
 */
typedef void (*non_anon_test_fn)(char *mem,
				 const char *smem,
				 size_t size);
1383 
/*
 * Basic COW check: after overwriting the private mapping @mem, the content
 * visible through the other mapping @smem must be unchanged.
 *
 * Reports exactly one test result; an allocation failure for the backup
 * buffer is reported as a test failure.
 */
static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Without the backup buffer there is nothing to compare against. */
	if (!old) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}
1399 
1400 static void test_ro_pin(char *mem, const char *smem, size_t size)
1401 {
1402 	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
1403 }
1404 
1405 static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
1406 {
1407 	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
1408 }
1409 
1410 static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
1411 {
1412 	char *mem, *smem, tmp;
1413 
1414 	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
1415 
1416 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
1417 		   MAP_PRIVATE | MAP_ANON, -1, 0);
1418 	if (mem == MAP_FAILED) {
1419 		ksft_test_result_fail("mmap() failed\n");
1420 		return;
1421 	}
1422 
1423 	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
1424 	if (mem == MAP_FAILED) {
1425 		ksft_test_result_fail("mmap() failed\n");
1426 		goto munmap;
1427 	}
1428 
1429 	/* Read from the page to populate the shared zeropage. */
1430 	tmp = *mem + *smem;
1431 	asm volatile("" : "+r" (tmp));
1432 
1433 	fn(mem, smem, pagesize);
1434 munmap:
1435 	munmap(mem, pagesize);
1436 	if (smem != MAP_FAILED)
1437 		munmap(smem, pagesize);
1438 }
1439 
1440 static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
1441 {
1442 	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
1443 	size_t mmap_size;
1444 	int ret;
1445 
1446 	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
1447 
1448 	if (!has_huge_zeropage) {
1449 		ksft_test_result_skip("Huge zeropage not enabled\n");
1450 		return;
1451 	}
1452 
1453 	/* For alignment purposes, we need twice the thp size. */
1454 	mmap_size = 2 * thpsize;
1455 	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1456 			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1457 	if (mmap_mem == MAP_FAILED) {
1458 		ksft_test_result_fail("mmap() failed\n");
1459 		return;
1460 	}
1461 	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
1462 			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1463 	if (mmap_smem == MAP_FAILED) {
1464 		ksft_test_result_fail("mmap() failed\n");
1465 		goto munmap;
1466 	}
1467 
1468 	/* We need a THP-aligned memory area. */
1469 	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
1470 	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));
1471 
1472 	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
1473 	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
1474 	if (ret) {
1475 		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
1476 		goto munmap;
1477 	}
1478 
1479 	/*
1480 	 * Read from the memory to populate the huge shared zeropage. Read from
1481 	 * the first sub-page and test if we get another sub-page populated
1482 	 * automatically.
1483 	 */
1484 	tmp = *mem + *smem;
1485 	asm volatile("" : "+r" (tmp));
1486 	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
1487 	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
1488 		ksft_test_result_skip("Did not get THPs populated\n");
1489 		goto munmap;
1490 	}
1491 
1492 	fn(mem, smem, thpsize);
1493 munmap:
1494 	munmap(mmap_mem, mmap_size);
1495 	if (mmap_smem != MAP_FAILED)
1496 		munmap(mmap_smem, mmap_size);
1497 }
1498 
1499 static void run_with_memfd(non_anon_test_fn fn, const char *desc)
1500 {
1501 	char *mem, *smem, tmp;
1502 	int fd;
1503 
1504 	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
1505 
1506 	fd = memfd_create("test", 0);
1507 	if (fd < 0) {
1508 		ksft_test_result_fail("memfd_create() failed\n");
1509 		return;
1510 	}
1511 
1512 	/* File consists of a single page filled with zeroes. */
1513 	if (fallocate(fd, 0, 0, pagesize)) {
1514 		ksft_test_result_fail("fallocate() failed\n");
1515 		goto close;
1516 	}
1517 
1518 	/* Create a private mapping of the memfd. */
1519 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1520 	if (mem == MAP_FAILED) {
1521 		ksft_test_result_fail("mmap() failed\n");
1522 		goto close;
1523 	}
1524 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1525 	if (mem == MAP_FAILED) {
1526 		ksft_test_result_fail("mmap() failed\n");
1527 		goto munmap;
1528 	}
1529 
1530 	/* Fault the page in. */
1531 	tmp = *mem + *smem;
1532 	asm volatile("" : "+r" (tmp));
1533 
1534 	fn(mem, smem, pagesize);
1535 munmap:
1536 	munmap(mem, pagesize);
1537 	if (smem != MAP_FAILED)
1538 		munmap(smem, pagesize);
1539 close:
1540 	close(fd);
1541 }
1542 
1543 static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
1544 {
1545 	char *mem, *smem, tmp;
1546 	FILE *file;
1547 	int fd;
1548 
1549 	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
1550 
1551 	file = tmpfile();
1552 	if (!file) {
1553 		ksft_test_result_fail("tmpfile() failed\n");
1554 		return;
1555 	}
1556 
1557 	fd = fileno(file);
1558 	if (fd < 0) {
1559 		ksft_test_result_skip("fileno() failed\n");
1560 		return;
1561 	}
1562 
1563 	/* File consists of a single page filled with zeroes. */
1564 	if (fallocate(fd, 0, 0, pagesize)) {
1565 		ksft_test_result_fail("fallocate() failed\n");
1566 		goto close;
1567 	}
1568 
1569 	/* Create a private mapping of the memfd. */
1570 	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
1571 	if (mem == MAP_FAILED) {
1572 		ksft_test_result_fail("mmap() failed\n");
1573 		goto close;
1574 	}
1575 	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
1576 	if (mem == MAP_FAILED) {
1577 		ksft_test_result_fail("mmap() failed\n");
1578 		goto munmap;
1579 	}
1580 
1581 	/* Fault the page in. */
1582 	tmp = *mem + *smem;
1583 	asm volatile("" : "+r" (tmp));
1584 
1585 	fn(mem, smem, pagesize);
1586 munmap:
1587 	munmap(mem, pagesize);
1588 	if (smem != MAP_FAILED)
1589 		munmap(smem, pagesize);
1590 close:
1591 	fclose(file);
1592 }
1593 
1594 static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
1595 				   size_t hugetlbsize)
1596 {
1597 	int flags = MFD_HUGETLB;
1598 	char *mem, *smem, tmp;
1599 	int fd;
1600 
1601 	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
1602 		       hugetlbsize / 1024);
1603 
1604 	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
1605 
1606 	fd = memfd_create("test", flags);
1607 	if (fd < 0) {
1608 		ksft_test_result_skip("memfd_create() failed\n");
1609 		return;
1610 	}
1611 
1612 	/* File consists of a single page filled with zeroes. */
1613 	if (fallocate(fd, 0, 0, hugetlbsize)) {
1614 		ksft_test_result_skip("need more free huge pages\n");
1615 		goto close;
1616 	}
1617 
1618 	/* Create a private mapping of the memfd. */
1619 	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
1620 		   0);
1621 	if (mem == MAP_FAILED) {
1622 		ksft_test_result_skip("need more free huge pages\n");
1623 		goto close;
1624 	}
1625 	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
1626 	if (mem == MAP_FAILED) {
1627 		ksft_test_result_fail("mmap() failed\n");
1628 		goto munmap;
1629 	}
1630 
1631 	/* Fault the page in. */
1632 	tmp = *mem + *smem;
1633 	asm volatile("" : "+r" (tmp));
1634 
1635 	fn(mem, smem, hugetlbsize);
1636 munmap:
1637 	munmap(mem, hugetlbsize);
1638 	if (mem != MAP_FAILED)
1639 		munmap(smem, hugetlbsize);
1640 close:
1641 	close(fd);
1642 }
1643 
1644 struct non_anon_test_case {
1645 	const char *desc;
1646 	non_anon_test_fn fn;
1647 };
1648 
1649 /*
1650  * Test cases that target any pages in private mappings that are not anonymous:
1651  * pages that may get shared via COW ndependent of fork(). This includes
1652  * the shared zeropage(s), pagecache pages, ...
1653  */
1654 static const struct non_anon_test_case non_anon_test_cases[] = {
1655 	/*
1656 	 * Basic COW test without any GUP. If we miss to break COW, changes are
1657 	 * visible via other private/shared mappings.
1658 	 */
1659 	{
1660 		"Basic COW",
1661 		test_cow,
1662 	},
1663 	/*
1664 	 * Take a R/O longterm pin. When modifying the page via the page table,
1665 	 * the page content change must be visible via the pin.
1666 	 */
1667 	{
1668 		"R/O longterm GUP pin",
1669 		test_ro_pin,
1670 	},
1671 	/* Same as above, but using GUP-fast. */
1672 	{
1673 		"R/O longterm GUP-fast pin",
1674 		test_ro_fast_pin,
1675 	},
1676 };
1677 
1678 static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
1679 {
1680 	int i;
1681 
1682 	run_with_zeropage(test_case->fn, test_case->desc);
1683 	run_with_memfd(test_case->fn, test_case->desc);
1684 	run_with_tmpfile(test_case->fn, test_case->desc);
1685 	if (thpsize)
1686 		run_with_huge_zeropage(test_case->fn, test_case->desc);
1687 	for (i = 0; i < nr_hugetlbsizes; i++)
1688 		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
1689 				       hugetlbsizes[i]);
1690 }
1691 
1692 static void run_non_anon_test_cases(void)
1693 {
1694 	int i;
1695 
1696 	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
1697 
1698 	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
1699 		run_non_anon_test_case(&non_anon_test_cases[i]);
1700 }
1701 
1702 static int tests_per_non_anon_test_case(void)
1703 {
1704 	int tests = 3 + nr_hugetlbsizes;
1705 
1706 	if (thpsize)
1707 		tests += 1;
1708 	return tests;
1709 }
1710 
1711 int main(int argc, char **argv)
1712 {
1713 	int err;
1714 
1715 	pagesize = getpagesize();
1716 	thpsize = read_pmd_pagesize();
1717 	if (thpsize)
1718 		ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
1719 			       thpsize / 1024);
1720 	detect_hugetlbsizes();
1721 	detect_huge_zeropage();
1722 
1723 	ksft_print_header();
1724 	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
1725 		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
1726 		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
1727 
1728 	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
1729 	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
1730 	if (pagemap_fd < 0)
1731 		ksft_exit_fail_msg("opening pagemap failed\n");
1732 
1733 	run_anon_test_cases();
1734 	run_anon_thp_test_cases();
1735 	run_non_anon_test_cases();
1736 
1737 	err = ksft_get_fail_cnt();
1738 	if (err)
1739 		ksft_exit_fail_msg("%d out of %d tests failed\n",
1740 				   err, ksft_test_num());
1741 	return ksft_exit_pass();
1742 }
1743