1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Userfaultfd tests util functions 4 * 5 * Copyright (C) 2015-2023 Red Hat, Inc. 6 */ 7 8 #include "uffd-common.h" 9 10 #define BASE_PMD_ADDR ((void *)(1UL << 30)) 11 12 volatile bool test_uffdio_copy_eexist = true; 13 unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; 14 char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; 15 int uffd = -1, uffd_flags, finished, *pipefd, test_type; 16 bool map_shared; 17 bool test_uffdio_wp = true; 18 unsigned long long *count_verify; 19 uffd_test_ops_t *uffd_test_ops; 20 atomic_bool ready_for_fork; 21 22 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb) 23 { 24 unsigned int memfd_flags = 0; 25 int mem_fd; 26 27 if (hugetlb) 28 memfd_flags = MFD_HUGETLB; 29 mem_fd = memfd_create("uffd-test", memfd_flags); 30 if (mem_fd < 0) 31 err("memfd_create"); 32 if (ftruncate(mem_fd, mem_size)) 33 err("ftruncate"); 34 if (fallocate(mem_fd, 35 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 36 mem_size)) 37 err("fallocate"); 38 39 return mem_fd; 40 } 41 42 static void anon_release_pages(char *rel_area) 43 { 44 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 45 err("madvise(MADV_DONTNEED) failed"); 46 } 47 48 static int anon_allocate_area(void **alloc_area, bool is_src) 49 { 50 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, 51 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); 52 if (*alloc_area == MAP_FAILED) { 53 *alloc_area = NULL; 54 return -errno; 55 } 56 return 0; 57 } 58 59 static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) 60 { 61 } 62 63 static void hugetlb_release_pages(char *rel_area) 64 { 65 if (!map_shared) { 66 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) 67 err("madvise(MADV_DONTNEED) failed"); 68 } else { 69 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 70 err("madvise(MADV_REMOVE) failed"); 71 } 72 } 73 74 static int hugetlb_allocate_area(void **alloc_area, bool is_src) 75 { 76 off_t size = nr_pages * page_size; 77 off_t offset = is_src ? 0 : size; 78 void *area_alias = NULL; 79 char **alloc_area_alias; 80 int mem_fd = uffd_mem_fd_create(size * 2, true); 81 82 *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, 83 (map_shared ? MAP_SHARED : MAP_PRIVATE) | 84 (is_src ? 0 : MAP_NORESERVE), 85 mem_fd, offset); 86 if (*alloc_area == MAP_FAILED) { 87 *alloc_area = NULL; 88 return -errno; 89 } 90 91 if (map_shared) { 92 area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, 93 MAP_SHARED, mem_fd, offset); 94 if (area_alias == MAP_FAILED) 95 return -errno; 96 } 97 98 if (is_src) { 99 alloc_area_alias = &area_src_alias; 100 } else { 101 alloc_area_alias = &area_dst_alias; 102 } 103 if (area_alias) 104 *alloc_area_alias = area_alias; 105 106 close(mem_fd); 107 return 0; 108 } 109 110 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) 111 { 112 if (!map_shared) 113 return; 114 115 *start = (unsigned long) area_dst_alias + offset; 116 } 117 118 static void shmem_release_pages(char *rel_area) 119 { 120 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) 121 err("madvise(MADV_REMOVE) failed"); 122 } 123 124 static int shmem_allocate_area(void **alloc_area, bool is_src) 125 { 126 void *area_alias = NULL; 127 size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize(); 128 unsigned long offset = is_src ? 0 : bytes; 129 char *p = NULL, *p_alias = NULL; 130 int mem_fd = uffd_mem_fd_create(bytes * 2, false); 131 132 /* TODO: clean this up. Use a static addr is ugly */ 133 p = BASE_PMD_ADDR; 134 if (!is_src) 135 /* src map + alias + interleaved hpages */ 136 p += 2 * (bytes + hpage_size); 137 p_alias = p; 138 p_alias += bytes; 139 p_alias += hpage_size; /* Prevent src/dst VMA merge */ 140 141 *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, 142 mem_fd, offset); 143 if (*alloc_area == MAP_FAILED) { 144 *alloc_area = NULL; 145 return -errno; 146 } 147 if (*alloc_area != p) 148 err("mmap of memfd failed at %p", p); 149 150 area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, 151 mem_fd, offset); 152 if (area_alias == MAP_FAILED) { 153 munmap(*alloc_area, bytes); 154 *alloc_area = NULL; 155 return -errno; 156 } 157 if (area_alias != p_alias) 158 err("mmap of anonymous memory failed at %p", p_alias); 159 160 if (is_src) 161 area_src_alias = area_alias; 162 else 163 area_dst_alias = area_alias; 164 165 close(mem_fd); 166 return 0; 167 } 168 169 static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) 170 { 171 *start = (unsigned long)area_dst_alias + offset; 172 } 173 174 static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) 175 { 176 if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, 177 read_pmd_pagesize())) 178 err("Did not find expected %d number of hugepages", 179 expect_nr_hpages); 180 } 181 182 struct uffd_test_ops anon_uffd_test_ops = { 183 .allocate_area = anon_allocate_area, 184 .release_pages = anon_release_pages, 185 .alias_mapping = noop_alias_mapping, 186 .check_pmd_mapping = NULL, 187 }; 188 189 struct uffd_test_ops shmem_uffd_test_ops = { 190 .allocate_area = shmem_allocate_area, 191 .release_pages = shmem_release_pages, 192 .alias_mapping = shmem_alias_mapping, 193 .check_pmd_mapping = shmem_check_pmd_mapping, 194 }; 195 196 struct uffd_test_ops hugetlb_uffd_test_ops = { 197 .allocate_area = hugetlb_allocate_area, 198 .release_pages = hugetlb_release_pages, 199 .alias_mapping = hugetlb_alias_mapping, 200 .check_pmd_mapping = NULL, 201 }; 202 203 void uffd_stats_report(struct uffd_args *args, int n_cpus) 204 { 205 int i; 206 unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; 207 208 for (i = 0; i < n_cpus; i++) { 209 miss_total += args[i].missing_faults; 210 wp_total += args[i].wp_faults; 211 minor_total += args[i].minor_faults; 212 } 213 214 printf("userfaults: "); 215 if (miss_total) { 216 printf("%llu missing (", miss_total); 217 for (i = 0; i < n_cpus; i++) 218 printf("%lu+", args[i].missing_faults); 219 printf("\b) "); 220 } 221 if (wp_total) { 222 printf("%llu wp (", wp_total); 223 for (i = 0; i < n_cpus; i++) 224 printf("%lu+", args[i].wp_faults); 225 printf("\b) "); 226 } 227 if (minor_total) { 228 printf("%llu minor (", minor_total); 229 for (i = 0; i < n_cpus; i++) 230 printf("%lu+", args[i].minor_faults); 231 printf("\b)"); 232 } 233 printf("\n"); 234 } 235 236 int userfaultfd_open(uint64_t *features) 237 { 238 struct uffdio_api uffdio_api; 239 240 uffd = uffd_open(UFFD_FLAGS); 241 if (uffd < 0) 242 return -1; 243 uffd_flags = fcntl(uffd, F_GETFD, NULL); 244 245 uffdio_api.api = UFFD_API; 246 uffdio_api.features = *features; 247 if (ioctl(uffd, UFFDIO_API, &uffdio_api)) 248 /* Probably lack of CAP_PTRACE? */ 249 return -1; 250 if (uffdio_api.api != UFFD_API) 251 err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); 252 253 *features = uffdio_api.features; 254 return 0; 255 } 256 257 static inline void munmap_area(void **area) 258 { 259 if (*area) 260 if (munmap(*area, nr_pages * page_size)) 261 err("munmap"); 262 263 *area = NULL; 264 } 265 266 static void uffd_test_ctx_clear(void) 267 { 268 size_t i; 269 270 if (pipefd) { 271 for (i = 0; i < nr_cpus * 2; ++i) { 272 if (close(pipefd[i])) 273 err("close pipefd"); 274 } 275 free(pipefd); 276 pipefd = NULL; 277 } 278 279 if (count_verify) { 280 free(count_verify); 281 count_verify = NULL; 282 } 283 284 if (uffd != -1) { 285 if (close(uffd)) 286 err("close uffd"); 287 uffd = -1; 288 } 289 290 munmap_area((void **)&area_src); 291 munmap_area((void **)&area_src_alias); 292 munmap_area((void **)&area_dst); 293 munmap_area((void **)&area_dst_alias); 294 munmap_area((void **)&area_remap); 295 } 296 297 int uffd_test_ctx_init(uint64_t features, const char **errmsg) 298 { 299 unsigned long nr, cpu; 300 int ret; 301 302 uffd_test_ctx_clear(); 303 304 ret = uffd_test_ops->allocate_area((void **)&area_src, true); 305 ret |= uffd_test_ops->allocate_area((void **)&area_dst, false); 306 if (ret) { 307 if (errmsg) 308 *errmsg = "memory allocation failed"; 309 return ret; 310 } 311 312 ret = userfaultfd_open(&features); 313 if (ret) { 314 if (errmsg) 315 *errmsg = "possible lack of priviledge"; 316 return ret; 317 } 318 319 count_verify = malloc(nr_pages * sizeof(unsigned long long)); 320 if (!count_verify) 321 err("count_verify"); 322 323 for (nr = 0; nr < nr_pages; nr++) { 324 *area_mutex(area_src, nr) = 325 (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; 326 count_verify[nr] = *area_count(area_src, nr) = 1; 327 /* 328 * In the transition between 255 to 256, powerpc will 329 * read out of order in my_bcmp and see both bytes as 330 * zero, so leave a placeholder below always non-zero 331 * after the count, to avoid my_bcmp to trigger false 332 * positives. 333 */ 334 *(area_count(area_src, nr) + 1) = 1; 335 } 336 337 /* 338 * After initialization of area_src, we must explicitly release pages 339 * for area_dst to make sure it's fully empty. Otherwise we could have 340 * some area_dst pages be errornously initialized with zero pages, 341 * hence we could hit memory corruption later in the test. 342 * 343 * One example is when THP is globally enabled, above allocate_area() 344 * calls could have the two areas merged into a single VMA (as they 345 * will have the same VMA flags so they're mergeable). When we 346 * initialize the area_src above, it's possible that some part of 347 * area_dst could have been faulted in via one huge THP that will be 348 * shared between area_src and area_dst. It could cause some of the 349 * area_dst won't be trapped by missing userfaults. 350 * 351 * This release_pages() will guarantee even if that happened, we'll 352 * proactively split the thp and drop any accidentally initialized 353 * pages within area_dst. 354 */ 355 uffd_test_ops->release_pages(area_dst); 356 357 pipefd = malloc(sizeof(int) * nr_cpus * 2); 358 if (!pipefd) 359 err("pipefd"); 360 for (cpu = 0; cpu < nr_cpus; cpu++) 361 if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) 362 err("pipe"); 363 364 return 0; 365 } 366 367 void wp_range(int ufd, __u64 start, __u64 len, bool wp) 368 { 369 struct uffdio_writeprotect prms; 370 371 /* Write protection page faults */ 372 prms.range.start = start; 373 prms.range.len = len; 374 /* Undo write-protect, do wakeup after that */ 375 prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; 376 377 if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) 378 err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); 379 } 380 381 static void continue_range(int ufd, __u64 start, __u64 len, bool wp) 382 { 383 struct uffdio_continue req; 384 int ret; 385 386 req.range.start = start; 387 req.range.len = len; 388 req.mode = 0; 389 if (wp) 390 req.mode |= UFFDIO_CONTINUE_MODE_WP; 391 392 if (ioctl(ufd, UFFDIO_CONTINUE, &req)) 393 err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, 394 (uint64_t)start); 395 396 /* 397 * Error handling within the kernel for continue is subtly different 398 * from copy or zeropage, so it may be a source of bugs. Trigger an 399 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. 400 */ 401 req.mapped = 0; 402 ret = ioctl(ufd, UFFDIO_CONTINUE, &req); 403 if (ret >= 0 || req.mapped != -EEXIST) 404 err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, 405 ret, (int64_t) req.mapped); 406 } 407 408 int uffd_read_msg(int ufd, struct uffd_msg *msg) 409 { 410 int ret = read(uffd, msg, sizeof(*msg)); 411 412 if (ret != sizeof(*msg)) { 413 if (ret < 0) { 414 if (errno == EAGAIN || errno == EINTR) 415 return 1; 416 err("blocking read error"); 417 } else { 418 err("short read"); 419 } 420 } 421 422 return 0; 423 } 424 425 void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) 426 { 427 unsigned long offset; 428 429 if (msg->event != UFFD_EVENT_PAGEFAULT) 430 err("unexpected msg event %u", msg->event); 431 432 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { 433 /* Write protect page faults */ 434 wp_range(uffd, msg->arg.pagefault.address, page_size, false); 435 args->wp_faults++; 436 } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { 437 uint8_t *area; 438 int b; 439 440 /* 441 * Minor page faults 442 * 443 * To prove we can modify the original range for testing 444 * purposes, we're going to bit flip this range before 445 * continuing. 446 * 447 * Note that this requires all minor page fault tests operate on 448 * area_dst (non-UFFD-registered) and area_dst_alias 449 * (UFFD-registered). 450 */ 451 452 area = (uint8_t *)(area_dst + 453 ((char *)msg->arg.pagefault.address - 454 area_dst_alias)); 455 for (b = 0; b < page_size; ++b) 456 area[b] = ~area[b]; 457 continue_range(uffd, msg->arg.pagefault.address, page_size, 458 args->apply_wp); 459 args->minor_faults++; 460 } else { 461 /* 462 * Missing page faults. 463 * 464 * Here we force a write check for each of the missing mode 465 * faults. It's guaranteed because the only threads that 466 * will trigger uffd faults are the locking threads, and 467 * their first instruction to touch the missing page will 468 * always be pthread_mutex_lock(). 469 * 470 * Note that here we relied on an NPTL glibc impl detail to 471 * always read the lock type at the entry of the lock op 472 * (pthread_mutex_t.__data.__type, offset 0x10) before 473 * doing any locking operations to guarantee that. It's 474 * actually not good to rely on this impl detail because 475 * logically a pthread-compatible lib can implement the 476 * locks without types and we can fail when linking with 477 * them. However since we used to find bugs with this 478 * strict check we still keep it around. Hopefully this 479 * could be a good hint when it fails again. If one day 480 * it'll break on some other impl of glibc we'll revisit. 481 */ 482 if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) 483 err("unexpected write fault"); 484 485 offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; 486 offset &= ~(page_size-1); 487 488 if (copy_page(uffd, offset, args->apply_wp)) 489 args->missing_faults++; 490 } 491 } 492 493 void *uffd_poll_thread(void *arg) 494 { 495 struct uffd_args *args = (struct uffd_args *)arg; 496 unsigned long cpu = args->cpu; 497 struct pollfd pollfd[2]; 498 struct uffd_msg msg; 499 struct uffdio_register uffd_reg; 500 int ret; 501 char tmp_chr; 502 503 if (!args->handle_fault) 504 args->handle_fault = uffd_handle_page_fault; 505 506 pollfd[0].fd = uffd; 507 pollfd[0].events = POLLIN; 508 pollfd[1].fd = pipefd[cpu*2]; 509 pollfd[1].events = POLLIN; 510 511 ready_for_fork = true; 512 513 for (;;) { 514 ret = poll(pollfd, 2, -1); 515 if (ret <= 0) { 516 if (errno == EINTR || errno == EAGAIN) 517 continue; 518 err("poll error: %d", ret); 519 } 520 if (pollfd[1].revents) { 521 if (!(pollfd[1].revents & POLLIN)) 522 err("pollfd[1].revents %d", pollfd[1].revents); 523 if (read(pollfd[1].fd, &tmp_chr, 1) != 1) 524 err("read pipefd error"); 525 break; 526 } 527 if (!(pollfd[0].revents & POLLIN)) 528 err("pollfd[0].revents %d", pollfd[0].revents); 529 if (uffd_read_msg(uffd, &msg)) 530 continue; 531 switch (msg.event) { 532 default: 533 err("unexpected msg event %u\n", msg.event); 534 break; 535 case UFFD_EVENT_PAGEFAULT: 536 args->handle_fault(&msg, args); 537 break; 538 case UFFD_EVENT_FORK: 539 close(uffd); 540 uffd = msg.arg.fork.ufd; 541 pollfd[0].fd = uffd; 542 break; 543 case UFFD_EVENT_REMOVE: 544 uffd_reg.range.start = msg.arg.remove.start; 545 uffd_reg.range.len = msg.arg.remove.end - 546 msg.arg.remove.start; 547 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) 548 err("remove failure"); 549 break; 550 case UFFD_EVENT_REMAP: 551 area_remap = area_dst; /* save for later unmap */ 552 area_dst = (char *)(unsigned long)msg.arg.remap.to; 553 break; 554 } 555 } 556 557 return NULL; 558 } 559 560 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, 561 unsigned long offset) 562 { 563 uffd_test_ops->alias_mapping(&uffdio_copy->dst, 564 uffdio_copy->len, 565 offset); 566 if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { 567 /* real retval in ufdio_copy.copy */ 568 if (uffdio_copy->copy != -EEXIST) 569 err("UFFDIO_COPY retry error: %"PRId64, 570 (int64_t)uffdio_copy->copy); 571 } else { 572 err("UFFDIO_COPY retry unexpected: %"PRId64, 573 (int64_t)uffdio_copy->copy); 574 } 575 } 576 577 static void wake_range(int ufd, unsigned long addr, unsigned long len) 578 { 579 struct uffdio_range uffdio_wake; 580 581 uffdio_wake.start = addr; 582 uffdio_wake.len = len; 583 584 if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) 585 fprintf(stderr, "error waking %lu\n", 586 addr), exit(1); 587 } 588 589 int __copy_page(int ufd, unsigned long offset, bool retry, bool wp) 590 { 591 struct uffdio_copy uffdio_copy; 592 593 if (offset >= nr_pages * page_size) 594 err("unexpected offset %lu\n", offset); 595 uffdio_copy.dst = (unsigned long) area_dst + offset; 596 uffdio_copy.src = (unsigned long) area_src + offset; 597 uffdio_copy.len = page_size; 598 if (wp) 599 uffdio_copy.mode = UFFDIO_COPY_MODE_WP; 600 else 601 uffdio_copy.mode = 0; 602 uffdio_copy.copy = 0; 603 if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { 604 /* real retval in ufdio_copy.copy */ 605 if (uffdio_copy.copy != -EEXIST) 606 err("UFFDIO_COPY error: %"PRId64, 607 (int64_t)uffdio_copy.copy); 608 wake_range(ufd, uffdio_copy.dst, page_size); 609 } else if (uffdio_copy.copy != page_size) { 610 err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); 611 } else { 612 if (test_uffdio_copy_eexist && retry) { 613 test_uffdio_copy_eexist = false; 614 retry_copy_page(ufd, &uffdio_copy, offset); 615 } 616 return 1; 617 } 618 return 0; 619 } 620 621 int copy_page(int ufd, unsigned long offset, bool wp) 622 { 623 return __copy_page(ufd, offset, false, wp); 624 } 625 626 int uffd_open_dev(unsigned int flags) 627 { 628 int fd, uffd; 629 630 fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); 631 if (fd < 0) 632 return fd; 633 uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags); 634 close(fd); 635 636 return uffd; 637 } 638 639 int uffd_open_sys(unsigned int flags) 640 { 641 #ifdef __NR_userfaultfd 642 return syscall(__NR_userfaultfd, flags); 643 #else 644 return -1; 645 #endif 646 } 647 648 int uffd_open(unsigned int flags) 649 { 650 int uffd = uffd_open_sys(flags); 651 652 if (uffd < 0) 653 uffd = uffd_open_dev(flags); 654 655 return uffd; 656 } 657 658 int uffd_get_features(uint64_t *features) 659 { 660 struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 }; 661 /* 662 * This should by default work in most kernels; the feature list 663 * will be the same no matter what we pass in here. 664 */ 665 int fd = uffd_open(UFFD_USER_MODE_ONLY); 666 667 if (fd < 0) 668 /* Maybe the kernel is older than user-only mode? */ 669 fd = uffd_open(0); 670 671 if (fd < 0) 672 return fd; 673 674 if (ioctl(fd, UFFDIO_API, &uffdio_api)) { 675 close(fd); 676 return -errno; 677 } 678 679 *features = uffdio_api.features; 680 close(fd); 681 682 return 0; 683 } 684