// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"

static size_t pagesize;
static int pagemap_fd;
static size_t thpsize;
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

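/*
 * Returns true iff every page in [addr, addr + size) is currently swapped
 * out according to /proc/self/pagemap; assumes addr and size are
 * pagesize-aligned.
 */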
static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};
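
/*
 * Handshake used by all fork()-based tests: the child writes one byte to
 * child_ready[] once it is set up, the parent writes one byte to
 * parent_ready[] once it has modified the memory, and each side busy-reads
 * its pipe end until that byte arrives.
 */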

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

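	/*
	 * The pipe now holds a reference on the pages: vmsplice() took the
	 * pages R/O via GUP instead of copying them, so their content stays
	 * readable via fds[0] even after we unmap them.
	 */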
	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}

static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}

static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	ksft_test_result(!memcmp(old, new, transferred),
			 "No leak from child into parent\n");
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}

static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *buf, *tmp;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd >= 0);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};

static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it has
			 * quit. The pages should now be mapped R/O into our
			 * page tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
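	/*
	 * PIN_LONGTERM_TEST_FLAG_USE_FAST makes the gup_test driver exercise
	 * the GUP-fast path (which walks page tables without taking the
	 * mmap_lock) instead of ordinary GUP.
	 */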
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if MADV_NOHUGEPAGE is not supported by this kernel (EINVAL). */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 0, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};

static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
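	/*
	 * Rounding (mmap_mem + thpsize) down to a thpsize boundary yields
	 * mmap_mem rounded up to the next boundary: e.g., with thpsize = 2 MiB
	 * and mmap_mem = 0x7f0000100000, mem becomes 0x7f0000200000. Doubling
	 * the mapping guarantees that thpsize bytes remain usable past any
	 * such boundary.
	 */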
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if we get
	 * another sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 0, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O.
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD);
}

static void run_with_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

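	/*
	 * mmap() expects the huge page size encoded as log2(size) shifted by
	 * MAP_HUGE_SHIFT. As hugetlbsize is a power of two, __builtin_ctzll()
	 * (count of trailing zero bits) is exactly log2(), e.g., 21 for 2 MiB.
	 */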
	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the
	 * parent. This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications
	 * by the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	if (thpsize) {
		run_with_thp(test_case->fn, test_case->desc);
		run_with_thp_swap(test_case->fn, test_case->desc);
		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
		run_with_partial_shared_thp(test_case->fn, test_case->desc);
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	if (thpsize)
		tests += 8;
	return tests;
}

enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};

static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!thpsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return thpsize ? 1 : 0;
}

typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
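	/*
	 * The dummy addition and the empty asm with tmp as an in/out operand
	 * force the compiler to actually emit both loads instead of
	 * optimizing the otherwise-unused reads away.
	 */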
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need a THP-aligned memory area. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, thpsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the file. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

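	/*
	 * Like MAP_HUGE_SHIFT for mmap(): memfd_create() takes
	 * log2(hugetlbsize) shifted by MFD_HUGE_SHIFT to select the hugetlb
	 * page size.
	 */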
	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single huge page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (thpsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (thpsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	int err;

	ksft_print_header();

	pagesize = getpagesize();
	thpsize = read_pmd_pagesize();
	if (thpsize)
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
			       thpsize / 1024);
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}
1713