// SPDX-License-Identifier: GPL-2.0-only
/*
 * COW (Copy On Write) tests.
 *
 * Copyright 2022, Red Hat, Inc.
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */
#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <linux/memfd.h>

#include "local_config.h"
#ifdef LOCAL_CONFIG_HAVE_LIBURING
#include <liburing.h>
#endif /* LOCAL_CONFIG_HAVE_LIBURING */

#include "../../../../mm/gup_test.h"
#include "../kselftest.h"
#include "vm_util.h"

static size_t pagesize;
static int pagemap_fd;
static size_t thpsize;
static int nr_hugetlbsizes;
static size_t hugetlbsizes[10];
static int gup_fd;
static bool has_huge_zeropage;

static void detect_huge_zeropage(void)
{
	int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
		      O_RDONLY);
	size_t enabled = 0;
	char buf[15];
	int ret;

	if (fd < 0)
		return;

	ret = pread(fd, buf, sizeof(buf), 0);
	if (ret > 0 && ret < sizeof(buf)) {
		buf[ret] = 0;

		enabled = strtoul(buf, NULL, 10);
		if (enabled == 1) {
			has_huge_zeropage = true;
			ksft_print_msg("[INFO] huge zeropage is enabled\n");
		}
	}

	close(fd);
}

static bool range_is_swapped(void *addr, size_t size)
{
	for (; size; addr += pagesize, size -= pagesize)
		if (!pagemap_is_swapped(pagemap_fd, addr))
			return false;
	return true;
}

struct comm_pipes {
	int child_ready[2];
	int parent_ready[2];
};

static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
	if (pipe(comm_pipes->child_ready) < 0)
		return -errno;
	if (pipe(comm_pipes->parent_ready) < 0) {
		close(comm_pipes->child_ready[0]);
		close(comm_pipes->child_ready[1]);
		return -errno;
	}

	return 0;
}

static void close_comm_pipes(struct comm_pipes *comm_pipes)
{
	close(comm_pipes->child_ready[0]);
	close(comm_pipes->child_ready[1]);
	close(comm_pipes->parent_ready[0]);
	close(comm_pipes->parent_ready[1]);
}
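
/*
 * The pipes implement a simple two-way handshake; a sketch of the intended
 * protocol (not enforced by the helpers themselves):
 *
 *	child:  write(child_ready[1], "0", 1);	-> "I'm set up"
 *	parent: read(child_ready[0], ...);	-> wait for the child
 *	parent: ... modify/unmap memory ...
 *	parent: write(parent_ready[1], "0", 1);	-> "I'm done"
 *	child:  read(parent_ready[0], ...);	-> proceed with memcmp
 */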

static int child_memcmp_fn(char *mem, size_t size,
			   struct comm_pipes *comm_pipes)
{
	char *old = malloc(size);
	char buf;

	/* Backup the original content. */
	memcpy(old, mem, size);

	/* Wait until the parent modified the page. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values. */
	return memcmp(old, mem, size);
}

static int child_vmsplice_memcmp_fn(char *mem, size_t size,
				    struct comm_pipes *comm_pipes)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	char *old, *new;
	int fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	/* Backup the original content. */
	memcpy(old, mem, size);

	if (pipe(fds) < 0)
		return -errno;

	/* Trigger a read-only pin. */
	transferred = vmsplice(fds[1], &iov, 1, 0);
	if (transferred < 0)
		return -errno;
	if (transferred == 0)
		return -EINVAL;

	/* Unmap it from our page tables. */
	if (munmap(mem, size) < 0)
		return -errno;

	/* Wait until the parent modified it. */
	write(comm_pipes->child_ready[1], "0", 1);
	while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
		;

	/* See if we still read the old values via the pipe. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0)
			return -errno;
	}

	return memcmp(old, new, transferred);
}
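
/*
 * Note on the vmsplice() trick above: vmsplice() takes a read-only GUP
 * reference on the pages and keeps them referenced from the pipe buffers
 * even after the child unmapped them. If COW is handled correctly, the
 * parent's later write must not be observable through those pages when
 * reading them back from the pipe.
 */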

typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);

static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
				  child_fn fn)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		exit(fn(mem, size, &comm_pipes));
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	if (do_mprotect) {
		/*
		 * mprotect() optimizations might try avoiding
		 * write-faults by directly mapping pages writable.
		 */
		ret = mprotect(mem, size, PROT_READ);
		ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_cow_in_parent(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
}

static void test_cow_in_parent_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
}

static void test_vmsplice_in_child(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
}

static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
{
	do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
}

static void do_test_vmsplice_in_parent(char *mem, size_t size,
				       bool before_fork)
{
	struct iovec iov = {
		.iov_base = mem,
		.iov_len = size,
	};
	ssize_t cur, total, transferred;
	struct comm_pipes comm_pipes;
	char *old, *new;
	int ret, fds[2];
	char buf;

	old = malloc(size);
	new = malloc(size);

	memcpy(old, mem, size);

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free;
	}

	if (pipe(fds) < 0) {
		ksft_test_result_fail("pipe() failed\n");
		goto close_comm_pipes;
	}

	if (before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			goto close_pipe;
		}
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_pipe;
	} else if (!ret) {
		write(comm_pipes.child_ready[1], "0", 1);
		while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
			;
		/* Modify page content in the child. */
		memset(mem, 0xff, size);
		exit(0);
	}

	if (!before_fork) {
		transferred = vmsplice(fds[1], &iov, 1, 0);
		if (transferred <= 0) {
			ksft_test_result_fail("vmsplice() failed\n");
			wait(&ret);
			goto close_pipe;
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;
	if (munmap(mem, size) < 0) {
		ksft_test_result_fail("munmap() failed\n");
		goto close_pipe;
	}
	write(comm_pipes.parent_ready[1], "0", 1);

	/* Wait until the child is done writing. */
	wait(&ret);
	if (!WIFEXITED(ret)) {
		ksft_test_result_fail("wait() failed\n");
		goto close_pipe;
	}

	/* See if we still read the old values. */
	for (total = 0; total < transferred; total += cur) {
		cur = read(fds[0], new + total, transferred - total);
		if (cur < 0) {
			ksft_test_result_fail("read() failed\n");
			goto close_pipe;
		}
	}

	ksft_test_result(!memcmp(old, new, transferred),
			 "No leak from child into parent\n");
close_pipe:
	close(fds[0]);
	close(fds[1]);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free:
	free(old);
	free(new);
}

static void test_vmsplice_before_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, true);
}

static void test_vmsplice_after_fork(char *mem, size_t size)
{
	do_test_vmsplice_in_parent(mem, size, false);
}

#ifdef LOCAL_CONFIG_HAVE_LIBURING
static void do_test_iouring(char *mem, size_t size, bool use_fork)
{
	struct comm_pipes comm_pipes;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	ssize_t cur, total;
	struct iovec iov;
	char *tmp, buf;
	int ret, fd;
	FILE *file;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		goto close_comm_pipes;
	}
	fd = fileno(file);
	assert(fd);

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		goto close_file;
	}

	/* Skip on errors, as we might just lack kernel support. */
	ret = io_uring_queue_init(1, &ring, 0);
	if (ret < 0) {
		ksft_test_result_skip("io_uring_queue_init() failed\n");
		goto free_tmp;
	}

	/*
	 * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
	 * | FOLL_LONGTERM the range.
	 *
	 * Skip on errors, as we might just lack kernel support or might not
	 * have sufficient MEMLOCK permissions.
	 */
	iov.iov_base = mem;
	iov.iov_len = size;
	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret) {
		ksft_test_result_skip("io_uring_register_buffers() failed\n");
		goto queue_exit;
	}

	if (use_fork) {
		/*
		 * fork() and keep the child alive until we're done. Note that
		 * we expect the pinned page to not get shared with the child.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto unregister_buffers;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;
	} else {
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto unregister_buffers;
		}
	}

	/*
	 * Modify the page and write page content as observed by the fixed
	 * buffer pin to the file so we can verify it.
	 */
	memset(mem, 0xff, size);
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		ksft_test_result_fail("io_uring_get_sqe() failed\n");
		goto quit_child;
	}
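	/*
	 * The final two arguments of io_uring_prep_write_fixed() are the
	 * file offset (0) and the index of the registered buffer (0, the
	 * only buffer we registered above).
	 */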
	io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_submit() failed\n");
		goto quit_child;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		ksft_test_result_fail("io_uring_wait_cqe() failed\n");
		goto quit_child;
	}

	if (cqe->res != size) {
		ksft_test_result_fail("write_fixed failed\n");
		goto quit_child;
	}
	io_uring_cqe_seen(&ring, cqe);

	/* Read back the file content to the temporary buffer. */
	total = 0;
	while (total < size) {
		cur = pread(fd, tmp + total, size - total, total);
		if (cur < 0) {
			ksft_test_result_fail("pread() failed\n");
			goto quit_child;
		}
		total += cur;
	}

	/* Finally, check if we read what we expected. */
	ksft_test_result(!memcmp(mem, tmp, size),
			 "Longterm R/W pin is reliable\n");

quit_child:
	if (use_fork) {
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
	}
unregister_buffers:
	io_uring_unregister_buffers(&ring);
queue_exit:
	io_uring_queue_exit(&ring);
free_tmp:
	free(tmp);
close_file:
	fclose(file);
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_iouring_ro(char *mem, size_t size)
{
	do_test_iouring(mem, size, false);
}

static void test_iouring_fork(char *mem, size_t size)
{
	do_test_iouring(mem, size, true);
}

#endif /* LOCAL_CONFIG_HAVE_LIBURING */

enum ro_pin_test {
	RO_PIN_TEST,
	RO_PIN_TEST_SHARED,
	RO_PIN_TEST_PREVIOUSLY_SHARED,
	RO_PIN_TEST_RO_EXCLUSIVE,
};
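
/*
 * Page states the R/O pin tests start from, as prepared by
 * do_test_ro_pin() below:
 * - RO_PIN_TEST: pin the page as-is, without prior fork() or mprotect().
 * - RO_PIN_TEST_SHARED: the page is still COW-shared with a live child.
 * - RO_PIN_TEST_PREVIOUSLY_SHARED: the child already quit; the page is
 *   exclusive again but still mapped R/O.
 * - RO_PIN_TEST_RO_EXCLUSIVE: never shared, but remapped R/O via
 *   mprotect() with softdirty tracking enabled to keep it R/O.
 */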

static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
			   bool fast)
{
	struct pin_longterm_test args;
	struct comm_pipes comm_pipes;
	char *tmp, buf;
	__u64 tmp_val;
	int ret;

	if (gup_fd < 0) {
		ksft_test_result_skip("gup_test not available\n");
		return;
	}

	tmp = malloc(size);
	if (!tmp) {
		ksft_test_result_fail("malloc() failed\n");
		return;
	}

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		goto free_tmp;
	}

	switch (test) {
	case RO_PIN_TEST:
		break;
	case RO_PIN_TEST_SHARED:
	case RO_PIN_TEST_PREVIOUSLY_SHARED:
		/*
		 * Share the pages with our child. As the pages are not pinned,
		 * this should just work.
		 */
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto close_comm_pipes;
		} else if (!ret) {
			write(comm_pipes.child_ready[1], "0", 1);
			while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
				;
			exit(0);
		}

		/* Wait until our child is ready. */
		while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
			;

		if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
			/*
			 * Tell the child to quit now and wait until it quit.
			 * The pages should now be mapped R/O into our page
			 * tables, but they are no longer shared.
			 */
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			if (!WIFEXITED(ret))
				ksft_print_msg("[INFO] wait() failed\n");
		}
		break;
	case RO_PIN_TEST_RO_EXCLUSIVE:
		/*
		 * Map the page R/O into the page table. Enable softdirty
		 * tracking to stop the page from getting mapped R/W immediately
		 * again by mprotect() optimizations. Note that we don't have an
		 * easy way to test if that worked (the pagemap does not export
		 * if the page is mapped R/O vs. R/W).
		 */
		ret = mprotect(mem, size, PROT_READ);
		clear_softdirty();
		ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Take a R/O pin. This should trigger unsharing. */
	args.addr = (__u64)(uintptr_t)mem;
	args.size = size;
	args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
	if (ret) {
		if (errno == EINVAL)
			ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
		else
			ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
		goto wait;
	}

	/* Modify the page. */
	memset(mem, 0xff, size);

	/*
	 * Read back the content via the pin to the temporary buffer and
	 * test if we observed the modification.
	 */
	tmp_val = (__u64)(uintptr_t)tmp;
	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
	if (ret)
		ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
	else
		ksft_test_result(!memcmp(mem, tmp, size),
				 "Longterm R/O pin is reliable\n");

	ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
	if (ret)
		ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
wait:
	switch (test) {
	case RO_PIN_TEST_SHARED:
		write(comm_pipes.parent_ready[1], "0", 1);
		wait(&ret);
		if (!WIFEXITED(ret))
			ksft_print_msg("[INFO] wait() failed\n");
		break;
	default:
		break;
	}
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
free_tmp:
	free(tmp);
}

static void test_ro_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
}

static void test_ro_fast_pin_on_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
}

static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
}

static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
}

static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
}

static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
}

typedef void (*test_fn)(char *mem, size_t size);

static void do_run_with_base_page(test_fn fn, bool swapout)
{
	char *mem;
	int ret;

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
	/* Ignore if MADV_NOHUGEPAGE is not around on this kernel. */
	if (ret && errno != EINVAL) {
		ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
		goto munmap;
	}

	/* Populate a base page. */
	memset(mem, 0, pagesize);

	if (swapout) {
		madvise(mem, pagesize, MADV_PAGEOUT);
		if (!pagemap_is_swapped(pagemap_fd, mem)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
	}

	fn(mem, pagesize);
munmap:
	munmap(mem, pagesize);
}

static void run_with_base_page(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with base page\n", desc);
	do_run_with_base_page(fn, false);
}

static void run_with_base_page_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
	do_run_with_base_page(fn, true);
}

enum thp_run {
	THP_RUN_PMD,
	THP_RUN_PMD_SWAPOUT,
	THP_RUN_PTE,
	THP_RUN_PTE_SWAPOUT,
	THP_RUN_SINGLE_PTE,
	THP_RUN_SINGLE_PTE_SWAPOUT,
	THP_RUN_PARTIAL_MREMAP,
	THP_RUN_PARTIAL_SHARED,
};
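
/*
 * THP mapping states exercised by do_run_with_thp(), summarizing the
 * setup code below:
 * - PMD: the THP is mapped by a single PMD.
 * - PTE: the THP is PTE-mapped after temporarily mprotect()'ing one
 *   subpage.
 * - SINGLE_PTE: all but one subpage were discarded via MADV_DONTNEED.
 * - PARTIAL_MREMAP: half of the THP was mremap()'ed elsewhere.
 * - PARTIAL_SHARED: only the first subpage was COW-shared with a
 *   short-lived child.
 * The *_SWAPOUT variants additionally swap the range out via
 * MADV_PAGEOUT before running the test.
 */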

static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
{
	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
	size_t size, mmap_size, mremap_size;
	int ret;

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	/* We need a THP-aligned memory area. */
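	/*
	 * Round up to the next thpsize boundary: e.g., with a 2 MiB THP, a
	 * (hypothetical) mmap_mem of 0x7f0000001000 yields
	 * mem = 0x7f0000200000. The double-sized mapping guarantees that
	 * such an aligned thpsize-d area exists within it.
	 */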
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Try to populate a THP. Touch the first sub-page and test if we get
	 * another sub-page populated automatically.
	 */
	mem[0] = 0;
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
		ksft_test_result_skip("Did not get a THP populated\n");
		goto munmap;
	}
	memset(mem, 0, thpsize);

	size = thpsize;
	switch (thp_run) {
	case THP_RUN_PMD:
	case THP_RUN_PMD_SWAPOUT:
		break;
	case THP_RUN_PTE:
	case THP_RUN_PTE_SWAPOUT:
		/*
		 * Trigger PTE-mapping the THP by temporarily mapping a single
		 * subpage R/O.
		 */
		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
		if (ret) {
			ksft_test_result_fail("mprotect() failed\n");
			goto munmap;
		}
		break;
	case THP_RUN_SINGLE_PTE:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		/*
		 * Discard all but a single subpage of that PTE-mapped THP. What
		 * remains is a single PTE mapping a single subpage.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
		if (ret) {
			ksft_test_result_fail("MADV_DONTNEED failed\n");
			goto munmap;
		}
		size = pagesize;
		break;
	case THP_RUN_PARTIAL_MREMAP:
		/*
		 * Remap half of the THP. We need some new memory location
		 * for that.
		 */
		mremap_size = thpsize / 2;
		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mremap_mem == MAP_FAILED) {
			ksft_test_result_fail("mmap() failed\n");
			goto munmap;
		}
		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
		if (tmp != mremap_mem) {
			ksft_test_result_fail("mremap() failed\n");
			goto munmap;
		}
		size = mremap_size;
		break;
	case THP_RUN_PARTIAL_SHARED:
		/*
		 * Share the first page of the THP with a child and quit the
		 * child. This will result in some parts of the THP never
		 * having been shared.
		 */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto munmap;
		}
		ret = fork();
		if (ret < 0) {
			ksft_test_result_fail("fork() failed\n");
			goto munmap;
		} else if (!ret) {
			exit(0);
		}
		wait(&ret);
		/* Allow for sharing all pages again. */
		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			goto munmap;
		}
		break;
	default:
		assert(false);
	}

	switch (thp_run) {
	case THP_RUN_PMD_SWAPOUT:
	case THP_RUN_PTE_SWAPOUT:
	case THP_RUN_SINGLE_PTE_SWAPOUT:
		madvise(mem, size, MADV_PAGEOUT);
		if (!range_is_swapped(mem, size)) {
			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
			goto munmap;
		}
		break;
	default:
		break;
	}

	fn(mem, size);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mremap_mem != MAP_FAILED)
		munmap(mremap_mem, mremap_size);
}

static void run_with_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD);
}

static void run_with_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
}

static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE);
}

static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
}

static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
}

static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
}

static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
}

static void run_with_partial_shared_thp(test_fn fn, const char *desc)
{
	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
}

static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
{
	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
	char *mem, *dummy;

	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

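	/*
	 * Encode log2 of the hugetlb size into the mmap() flags; e.g., for a
	 * 2 MiB page, __builtin_ctzll() yields 21 and the resulting flag
	 * equals MAP_HUGE_2MB.
	 */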
	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;

	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		return;
	}

	/* Populate a huge page. */
	memset(mem, 0, hugetlbsize);

	/*
	 * We need a total of two hugetlb pages to handle COW/unsharing
	 * properly, otherwise we might get zapped by a SIGBUS.
	 */
	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
	if (dummy == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto munmap;
	}
	munmap(dummy, hugetlbsize);

	fn(mem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
}

struct test_case {
	const char *desc;
	test_fn fn;
};

/*
 * Test cases that are specific to anonymous pages: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_test_cases[] = {
	/*
	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
	 * either the child can observe modifications by the parent or the
	 * other way around.
	 */
	{
		"Basic COW after fork()",
		test_cow_in_parent,
	},
	/*
	 * Basic test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"Basic COW after fork() with mprotect() optimization",
		test_cow_in_parent_mprotect,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
	 * we fail to break COW, the child observes modifications by the
	 * parent. This is CVE-2020-29374 reported by Jann Horn.
	 */
	{
		"vmsplice() + unmap in child",
		test_vmsplice_in_child
	},
	/*
	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
	 */
	{
		"vmsplice() + unmap in child with mprotect() optimization",
		test_vmsplice_in_child_mprotect
	},
	/*
	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
	 * fork(); modify in the child. If we fail to break COW, the parent
	 * observes modifications by the child.
	 */
	{
		"vmsplice() before fork(), unmap in parent after fork()",
		test_vmsplice_before_fork,
	},
	/*
	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
	 * child. If we fail to break COW, the parent observes modifications
	 * by the child.
	 */
	{
		"vmsplice() + unmap in parent after fork()",
		test_vmsplice_after_fork,
	},
#ifdef LOCAL_CONFIG_HAVE_LIBURING
	/*
	 * Take a R/W longterm pin and then map the page R/O into the page
	 * table to trigger a write fault on next access. When modifying the
	 * page, the page content must be visible via the pin.
	 */
	{
		"R/O-mapping a page registered as iouring fixed buffer",
		test_iouring_ro,
	},
	/*
	 * Take a R/W longterm pin and then fork() a child. When modifying the
	 * page, the page content must be visible via the pin. We expect the
	 * pinned page to not get shared with the child.
	 */
	{
		"fork() with an iouring fixed buffer",
		test_iouring_fork,
	},

#endif /* LOCAL_CONFIG_HAVE_LIBURING */
	/*
	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped shared page",
		test_ro_pin_on_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped shared page",
		test_ro_fast_pin_on_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
	 * was previously shared. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped previously-shared page",
		test_ro_pin_on_ro_previously_shared,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped previously-shared page",
		test_ro_fast_pin_on_ro_previously_shared,
	},
	/*
	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
	 * When modifying the page via the page table, the page content change
	 * must be visible via the pin.
	 */
	{
		"R/O GUP pin on R/O-mapped exclusive page",
		test_ro_pin_on_ro_exclusive,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O GUP-fast pin on R/O-mapped exclusive page",
		test_ro_fast_pin_on_ro_exclusive,
	},
};

static void run_anon_test_case(struct test_case const *test_case)
{
	int i;

	run_with_base_page(test_case->fn, test_case->desc);
	run_with_base_page_swap(test_case->fn, test_case->desc);
	if (thpsize) {
		run_with_thp(test_case->fn, test_case->desc);
		run_with_thp_swap(test_case->fn, test_case->desc);
		run_with_pte_mapped_thp(test_case->fn, test_case->desc);
		run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
		run_with_single_pte_of_thp(test_case->fn, test_case->desc);
		run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
		run_with_partial_mremap_thp(test_case->fn, test_case->desc);
		run_with_partial_shared_thp(test_case->fn, test_case->desc);
	}
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_hugetlb(test_case->fn, test_case->desc,
				 hugetlbsizes[i]);
}

static void run_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
		run_anon_test_case(&anon_test_cases[i]);
}

static int tests_per_anon_test_case(void)
{
	int tests = 2 + nr_hugetlbsizes;

	if (thpsize)
		tests += 8;
	return tests;
}

enum anon_thp_collapse_test {
	ANON_THP_COLLAPSE_UNSHARED,
	ANON_THP_COLLAPSE_FULLY_SHARED,
	ANON_THP_COLLAPSE_LOWER_SHARED,
	ANON_THP_COLLAPSE_UPPER_SHARED,
};
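
/*
 * How the PTE-mapped THP relates to the child when it gets collapsed again
 * via MADV_COLLAPSE (see do_test_anon_thp_collapse() below): UNSHARED
 * collapses before fork(), FULLY_SHARED collapses the fully COW-shared THP
 * after fork(), and LOWER/UPPER_SHARED use MADV_DONTFORK to COW-share only
 * the lower/upper half before collapsing.
 */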

static void do_test_anon_thp_collapse(char *mem, size_t size,
				      enum anon_thp_collapse_test test)
{
	struct comm_pipes comm_pipes;
	char buf;
	int ret;

	ret = setup_comm_pipes(&comm_pipes);
	if (ret) {
		ksft_test_result_fail("pipe() failed\n");
		return;
	}

	/*
	 * Trigger PTE-mapping the THP by temporarily mapping a single subpage
	 * R/O, such that we can try collapsing it later.
	 */
	ret = mprotect(mem + pagesize, pagesize, PROT_READ);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}
	ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
	if (ret) {
		ksft_test_result_fail("mprotect() failed\n");
		goto close_comm_pipes;
	}

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		/* Collapse before actually COW-sharing the page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* COW-share the full PTE-mapped THP. */
		break;
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/* Don't COW-share the upper part of the THP. */
		ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
		/* Don't COW-share the lower part of the THP. */
		ret = madvise(mem, size / 2, MADV_DONTFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DONTFORK failed\n");
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	ret = fork();
	if (ret < 0) {
		ksft_test_result_fail("fork() failed\n");
		goto close_comm_pipes;
	} else if (!ret) {
		switch (test) {
		case ANON_THP_COLLAPSE_UNSHARED:
		case ANON_THP_COLLAPSE_FULLY_SHARED:
			exit(child_memcmp_fn(mem, size, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_LOWER_SHARED:
			exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
			break;
		case ANON_THP_COLLAPSE_UPPER_SHARED:
			exit(child_memcmp_fn(mem + size / 2, size / 2,
					     &comm_pipes));
			break;
		default:
			assert(false);
		}
	}

	while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
		;

	switch (test) {
	case ANON_THP_COLLAPSE_UNSHARED:
		break;
	case ANON_THP_COLLAPSE_UPPER_SHARED:
	case ANON_THP_COLLAPSE_LOWER_SHARED:
		/*
		 * Revert MADV_DONTFORK such that we merge the VMAs and are
		 * able to actually collapse.
		 */
		ret = madvise(mem, size, MADV_DOFORK);
		if (ret) {
			ksft_test_result_fail("MADV_DOFORK failed\n");
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		/* FALLTHROUGH */
	case ANON_THP_COLLAPSE_FULLY_SHARED:
		/* Collapse before anyone modified the COW-shared page. */
		ret = madvise(mem, size, MADV_COLLAPSE);
		if (ret) {
			ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
					      strerror(errno));
			write(comm_pipes.parent_ready[1], "0", 1);
			wait(&ret);
			goto close_comm_pipes;
		}
		break;
	default:
		assert(false);
	}

	/* Modify the page. */
	memset(mem, 0xff, size);
	write(comm_pipes.parent_ready[1], "0", 1);

	wait(&ret);
	if (WIFEXITED(ret))
		ret = WEXITSTATUS(ret);
	else
		ret = -EINVAL;

	ksft_test_result(!ret, "No leak from parent into child\n");
close_comm_pipes:
	close_comm_pipes(&comm_pipes);
}

static void test_anon_thp_collapse_unshared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
}

static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
}

static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
}

static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
{
	do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
}

/*
 * Test cases that are specific to anonymous THP: pages in private mappings
 * that may get shared via COW during fork().
 */
static const struct test_case anon_thp_test_cases[] = {
	/*
	 * Basic COW test for fork() without any GUP when collapsing a THP
	 * before fork().
	 *
	 * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
	 * collapse") might easily get COW handling wrong when not collapsing
	 * exclusivity information properly.
	 */
	{
		"Basic COW after fork() when collapsing before fork()",
		test_anon_thp_collapse_unshared,
	},
	/* Basic COW test, but collapse after COW-sharing a full THP. */
	{
		"Basic COW after fork() when collapsing after fork() (fully shared)",
		test_anon_thp_collapse_fully_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the lower half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (lower shared)",
		test_anon_thp_collapse_lower_shared,
	},
	/*
	 * Basic COW test, but collapse after COW-sharing the upper half of a
	 * THP.
	 */
	{
		"Basic COW after fork() when collapsing after fork() (upper shared)",
		test_anon_thp_collapse_upper_shared,
	},
};

static void run_anon_thp_test_cases(void)
{
	int i;

	if (!thpsize)
		return;

	ksft_print_msg("[INFO] Anonymous THP tests\n");

	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
		struct test_case const *test_case = &anon_thp_test_cases[i];

		ksft_print_msg("[RUN] %s\n", test_case->desc);
		do_run_with_thp(test_case->fn, THP_RUN_PMD);
	}
}

static int tests_per_anon_thp_test_case(void)
{
	return thpsize ? 1 : 0;
}

typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);

static void test_cow(char *mem, const char *smem, size_t size)
{
	char *old = malloc(size);

	/* Backup the original content. */
	memcpy(old, smem, size);

	/* Modify the page. */
	memset(mem, 0xff, size);

	/* See if we still read the old values via the other mapping. */
	ksft_test_result(!memcmp(smem, old, size),
			 "Other mapping not modified\n");
	free(old);
}

static void test_ro_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
}

static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
{
	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
}

static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;

	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);

	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANON, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}

	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Read from the page to populate the shared zeropage. */
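	/*
	 * The empty asm with a "+r" constraint forces the compiler to treat
	 * tmp as used, so the reads cannot be optimized away.
	 */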
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
}

static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
	size_t mmap_size;
	int ret;

	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);

	if (!has_huge_zeropage) {
		ksft_test_result_skip("Huge zeropage not enabled\n");
		return;
	}

	/* For alignment purposes, we need twice the thp size. */
	mmap_size = 2 * thpsize;
	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		return;
	}
	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mmap_smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* We need THP-aligned memory areas. */
	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));

	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
	if (ret) {
		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
		goto munmap;
	}

	/*
	 * Read from the memory to populate the huge shared zeropage. Read from
	 * the first sub-page and test if we get another sub-page populated
	 * automatically.
	 */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));
	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
		ksft_test_result_skip("Did not get THPs populated\n");
		goto munmap;
	}

	fn(mem, smem, thpsize);
munmap:
	munmap(mmap_mem, mmap_size);
	if (mmap_smem != MAP_FAILED)
		munmap(mmap_smem, mmap_size);
}

static void run_with_memfd(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd\n", desc);

	fd = memfd_create("test", 0);
	if (fd < 0) {
		ksft_test_result_fail("memfd_create() failed\n");
		return;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	close(fd);
}

static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
{
	char *mem, *smem, tmp;
	FILE *file;
	int fd;

	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);

	file = tmpfile();
	if (!file) {
		ksft_test_result_fail("tmpfile() failed\n");
		return;
	}

	fd = fileno(file);
	if (fd < 0) {
		ksft_test_result_skip("fileno() failed\n");
		goto close;
	}

	/* File consists of a single page filled with zeroes. */
	if (fallocate(fd, 0, 0, pagesize)) {
		ksft_test_result_fail("fallocate() failed\n");
		goto close;
	}

	/* Create a private mapping of the tmpfile. */
	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto close;
	}
	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, pagesize);
munmap:
	munmap(mem, pagesize);
	if (smem != MAP_FAILED)
		munmap(smem, pagesize);
close:
	fclose(file);
}

static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
				   size_t hugetlbsize)
{
	int flags = MFD_HUGETLB;
	char *mem, *smem, tmp;
	int fd;

	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
		       hugetlbsize / 1024);

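	/* Same log2 size encoding as for MAP_HUGE_SHIFT above. */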
	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;

	fd = memfd_create("test", flags);
	if (fd < 0) {
		ksft_test_result_skip("memfd_create() failed\n");
		return;
	}

	/* File consists of a single huge page filled with zeroes. */
	if (fallocate(fd, 0, 0, hugetlbsize)) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}

	/* Create a private mapping of the memfd. */
	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
		   0);
	if (mem == MAP_FAILED) {
		ksft_test_result_skip("need more free huge pages\n");
		goto close;
	}
	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
	if (smem == MAP_FAILED) {
		ksft_test_result_fail("mmap() failed\n");
		goto munmap;
	}

	/* Fault the page in. */
	tmp = *mem + *smem;
	asm volatile("" : "+r" (tmp));

	fn(mem, smem, hugetlbsize);
munmap:
	munmap(mem, hugetlbsize);
	if (smem != MAP_FAILED)
		munmap(smem, hugetlbsize);
close:
	close(fd);
}

struct non_anon_test_case {
	const char *desc;
	non_anon_test_fn fn;
};

/*
 * Test cases that target any pages in private mappings that are not anonymous:
 * pages that may get shared via COW independent of fork(). This includes
 * the shared zeropage(s), pagecache pages, ...
 */
static const struct non_anon_test_case non_anon_test_cases[] = {
	/*
	 * Basic COW test without any GUP. If we fail to break COW, changes are
	 * visible via other private/shared mappings.
	 */
	{
		"Basic COW",
		test_cow,
	},
	/*
	 * Take a R/O longterm pin. When modifying the page via the page table,
	 * the page content change must be visible via the pin.
	 */
	{
		"R/O longterm GUP pin",
		test_ro_pin,
	},
	/* Same as above, but using GUP-fast. */
	{
		"R/O longterm GUP-fast pin",
		test_ro_fast_pin,
	},
};

static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
{
	int i;

	run_with_zeropage(test_case->fn, test_case->desc);
	run_with_memfd(test_case->fn, test_case->desc);
	run_with_tmpfile(test_case->fn, test_case->desc);
	if (thpsize)
		run_with_huge_zeropage(test_case->fn, test_case->desc);
	for (i = 0; i < nr_hugetlbsizes; i++)
		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
				       hugetlbsizes[i]);
}

static void run_non_anon_test_cases(void)
{
	int i;

	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");

	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
		run_non_anon_test_case(&non_anon_test_cases[i]);
}

static int tests_per_non_anon_test_case(void)
{
	int tests = 3 + nr_hugetlbsizes;

	if (thpsize)
		tests += 1;
	return tests;
}

int main(int argc, char **argv)
{
	int err;

	ksft_print_header();

	pagesize = getpagesize();
	thpsize = read_pmd_pagesize();
	if (thpsize)
		ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
			       thpsize / 1024);
	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
						    ARRAY_SIZE(hugetlbsizes));
	detect_huge_zeropage();

	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());

	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_fail_msg("opening pagemap failed\n");

	run_anon_test_cases();
	run_anon_thp_test_cases();
	run_non_anon_test_cases();

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}