// SPDX-License-Identifier: GPL-2.0-only
/*
 * Userfaultfd tests util functions
 *
 * Copyright (C) 2015-2023  Red Hat, Inc.
 */

#include "uffd-common.h"

#define BASE_PMD_ADDR ((void *)(1UL << 30))

volatile bool test_uffdio_copy_eexist = true;
unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
int uffd = -1, uffd_flags, finished, *pipefd, test_type;
bool map_shared;
bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;

static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
	unsigned int memfd_flags = 0;
	int mem_fd;

	if (hugetlb)
		memfd_flags = MFD_HUGETLB;
	mem_fd = memfd_create("uffd-test", memfd_flags);
	if (mem_fd < 0)
		err("memfd_create");
	if (ftruncate(mem_fd, mem_size))
		err("ftruncate");
	if (fallocate(mem_fd,
		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
		      mem_size))
		err("fallocate");

	return mem_fd;
}

static void anon_release_pages(char *rel_area)
{
	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
		err("madvise(MADV_DONTNEED) failed");
}

static int anon_allocate_area(void **alloc_area, bool is_src)
{
	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}
	return 0;
}

static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}

static void hugetlb_release_pages(char *rel_area)
{
	if (!map_shared) {
		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
			err("madvise(MADV_DONTNEED) failed");
	} else {
		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
			err("madvise(MADV_REMOVE) failed");
	}
}

static int hugetlb_allocate_area(void **alloc_area, bool is_src)
{
	off_t size = nr_pages * page_size;
	off_t offset = is_src ? 0 : size;
	void *area_alias = NULL;
	char **alloc_area_alias;
	int mem_fd = uffd_mem_fd_create(size * 2, true);

	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
			   (is_src ? 0 : MAP_NORESERVE),
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}

	if (map_shared) {
		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_SHARED, mem_fd, offset);
		if (area_alias == MAP_FAILED) {
			/* Don't leak the main mapping on the error path */
			munmap(*alloc_area, size);
			*alloc_area = NULL;
			return -errno;
		}
	}

	if (is_src)
		alloc_area_alias = &area_src_alias;
	else
		alloc_area_alias = &area_dst_alias;
	if (area_alias)
		*alloc_area_alias = area_alias;

	close(mem_fd);
	return 0;
}

static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	if (!map_shared)
		return;

	*start = (unsigned long) area_dst_alias + offset;
}

static void shmem_release_pages(char *rel_area)
{
	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
		err("madvise(MADV_REMOVE) failed");
}
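/*
 * shmem areas are placed at fixed addresses so that src and dst stay
 * predictable and hugepage-aligned (see the TODO below).  The layout,
 * derived from the mmap calls in shmem_allocate_area(), is:
 *
 *	BASE_PMD_ADDR + 0 * (bytes + hpage_size):  src
 *	BASE_PMD_ADDR + 1 * (bytes + hpage_size):  src alias
 *	BASE_PMD_ADDR + 2 * (bytes + hpage_size):  dst
 *	BASE_PMD_ADDR + 3 * (bytes + hpage_size):  dst alias
 *
 * The hpage_size gap between an area and its alias prevents the two
 * VMAs from merging.
 */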
static int shmem_allocate_area(void **alloc_area, bool is_src)
{
	void *area_alias = NULL;
	size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
	unsigned long offset = is_src ? 0 : bytes;
	char *p = NULL, *p_alias = NULL;
	int mem_fd = uffd_mem_fd_create(bytes * 2, false);

	/* TODO: clean this up.  Using a static addr is ugly */
	p = BASE_PMD_ADDR;
	if (!is_src)
		/* src map + alias + interleaved hpages */
		p += 2 * (bytes + hpage_size);
	p_alias = p;
	p_alias += bytes;
	p_alias += hpage_size;	/* Prevent src/dst VMA merge */

	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED) {
		*alloc_area = NULL;
		return -errno;
	}
	if (*alloc_area != p)
		err("mmap of memfd failed at %p", p);

	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			  mem_fd, offset);
	if (area_alias == MAP_FAILED) {
		munmap(*alloc_area, bytes);
		*alloc_area = NULL;
		return -errno;
	}
	if (area_alias != p_alias)
		err("mmap of memfd alias failed at %p", p_alias);

	if (is_src)
		area_src_alias = area_alias;
	else
		area_dst_alias = area_alias;

	close(mem_fd);
	return 0;
}

static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	*start = (unsigned long)area_dst_alias + offset;
}

static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
{
	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
			      read_pmd_pagesize()))
		err("Did not find expected %d number of hugepages",
		    expect_nr_hpages);
}

struct uffd_test_ops anon_uffd_test_ops = {
	.allocate_area = anon_allocate_area,
	.release_pages = anon_release_pages,
	.alias_mapping = noop_alias_mapping,
	.check_pmd_mapping = NULL,
};

struct uffd_test_ops shmem_uffd_test_ops = {
	.allocate_area = shmem_allocate_area,
	.release_pages = shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
	.check_pmd_mapping = shmem_check_pmd_mapping,
};

struct uffd_test_ops hugetlb_uffd_test_ops = {
	.allocate_area = hugetlb_allocate_area,
	.release_pages = hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
	.check_pmd_mapping = NULL,
};

void uffd_stats_report(struct uffd_args *args, int n_cpus)
{
	int i;
	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;

	for (i = 0; i < n_cpus; i++) {
		miss_total += args[i].missing_faults;
		wp_total += args[i].wp_faults;
		minor_total += args[i].minor_faults;
	}

	printf("userfaults: ");
	if (miss_total) {
		printf("%llu missing (", miss_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].missing_faults);
		printf("\b) ");
	}
	if (wp_total) {
		printf("%llu wp (", wp_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].wp_faults);
		printf("\b) ");
	}
	if (minor_total) {
		printf("%llu minor (", minor_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", args[i].minor_faults);
		printf("\b)");
	}
	printf("\n");
}
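/*
 * Open the global uffd and negotiate the API with the feature set in
 * *features.  On success, *features is updated to the feature mask the
 * kernel actually supports.  Returns 0 on success, -1 on failure (e.g.
 * the syscall is unavailable or the UFFDIO_API handshake is refused).
 */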
int userfaultfd_open(uint64_t *features)
{
	struct uffdio_api uffdio_api;

	uffd = uffd_open(UFFD_FLAGS);
	if (uffd < 0)
		return -1;
	uffd_flags = fcntl(uffd, F_GETFD, NULL);

	uffdio_api.api = UFFD_API;
	uffdio_api.features = *features;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
		/* Probably lack of CAP_SYS_PTRACE? */
		return -1;
	if (uffdio_api.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);

	*features = uffdio_api.features;
	return 0;
}

static inline void munmap_area(void **area)
{
	if (*area)
		if (munmap(*area, nr_pages * page_size))
			err("munmap");

	*area = NULL;
}

static void uffd_test_ctx_clear(void)
{
	size_t i;

	if (pipefd) {
		for (i = 0; i < nr_cpus * 2; ++i) {
			if (close(pipefd[i]))
				err("close pipefd");
		}
		free(pipefd);
		pipefd = NULL;
	}

	if (count_verify) {
		free(count_verify);
		count_verify = NULL;
	}

	if (uffd != -1) {
		if (close(uffd))
			err("close uffd");
		uffd = -1;
	}

	munmap_area((void **)&area_src);
	munmap_area((void **)&area_src_alias);
	munmap_area((void **)&area_dst);
	munmap_area((void **)&area_dst_alias);
	munmap_area((void **)&area_remap);
}
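/*
 * (Re)build the whole test context: tear down any previous state, then
 * allocate the src/dst areas, open the uffd, seed area_src with the
 * per-page counters and mutexes, and create the per-cpu wakeup pipes.
 * Returns 0 on success; on failure returns nonzero and, if errmsg is
 * non-NULL, points it at a static description of what went wrong.
 */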
int uffd_test_ctx_init(uint64_t features, const char **errmsg)
{
	unsigned long nr, cpu;
	int ret;

	uffd_test_ctx_clear();

	ret = uffd_test_ops->allocate_area((void **)&area_src, true);
	ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
	if (ret) {
		if (errmsg)
			*errmsg = "memory allocation failed";
		return ret;
	}

	ret = userfaultfd_open(&features);
	if (ret) {
		if (errmsg)
			*errmsg = "possible lack of privilege";
		return ret;
	}

	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify)
		err("count_verify");

	for (nr = 0; nr < nr_pages; nr++) {
		*area_mutex(area_src, nr) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
		/*
		 * In the transition from 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to keep my_bcmp from triggering
		 * false positives.
		 */
		*(area_count(area_src, nr) + 1) = 1;
	}

	/*
	 * After initializing area_src, we must explicitly release pages
	 * for area_dst to make sure it's fully empty.  Otherwise some
	 * area_dst pages could be erroneously initialized with zero
	 * pages, and we could hit memory corruption later in the test.
	 *
	 * One example is when THP is globally enabled: the allocate_area()
	 * calls above could have the two areas merged into a single VMA
	 * (as they will have the same VMA flags, so they're mergeable).
	 * When we initialize area_src above, it's possible that some part
	 * of area_dst could have been faulted in via one huge THP that
	 * will be shared between area_src and area_dst.  That could cause
	 * part of area_dst to not be trapped by missing userfaults.
	 *
	 * This release_pages() guarantees that even if that happened,
	 * we'll proactively split the THP and drop any accidentally
	 * initialized pages within area_dst.
	 */
	uffd_test_ops->release_pages(area_dst);

	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd)
		err("pipefd");
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");

	return 0;
}

void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	prms.range.start = start;
	prms.range.len = len;
	/* wp == true arms write-protect; wp == false undoes it (and wakes waiters) */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("UFFDIO_WRITEPROTECT failed: address=0x%"PRIx64,
		    (uint64_t)start);
}

static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;
	if (wp)
		req.mode |= UFFDIO_CONTINUE_MODE_WP;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs.  Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}

int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	/* Read from the fd we were given, not the global uffd */
	int ret = read(ufd, msg, sizeof(*msg));

	if (ret != sizeof(*msg)) {
		if (ret < 0) {
			if (errno == EAGAIN || errno == EINTR)
				return 1;
			err("blocking read error");
		} else {
			err("short read");
		}
	}

	return 0;
}
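/*
 * Default fault handler for uffd_poll_thread(): resolves a single
 * pagefault message, dispatching on the fault flags.  Write-protect
 * faults get un-protected, minor faults get bit-flipped and resolved
 * with UFFDIO_CONTINUE, and missing faults are filled from area_src
 * with UFFDIO_COPY.  The per-type counters in args are bumped
 * accordingly.
 */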
void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
		args->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests to
		 * operate on area_dst (non-UFFD-registered) and
		 * area_dst_alias (UFFD-registered).
		 */

		area = (uint8_t *)(area_dst +
				   ((char *)(unsigned long)msg->arg.pagefault.address -
				    area_dst_alias));
		for (b = 0; b < page_size; ++b)
			area[b] = ~area[b];
		continue_range(uffd, msg->arg.pagefault.address, page_size,
			       args->apply_wp);
		args->minor_faults++;
	} else {
		/*
		 * Missing page faults.
		 *
		 * Here we force a write check for each of the missing mode
		 * faults.  It's guaranteed because the only threads that
		 * will trigger uffd faults are the locking threads, and
		 * their first instruction to touch the missing page will
		 * always be pthread_mutex_lock().
		 *
		 * Note that here we rely on an NPTL glibc impl detail:
		 * the lock type is always read at the entry of the lock op
		 * (pthread_mutex_t.__data.__type, offset 0x10) before any
		 * locking operation, which guarantees the read access.
		 * It's actually not good to rely on this impl detail,
		 * because logically a pthread-compatible lib can implement
		 * the locks without types and we could fail when linking
		 * with it.  However, since this strict check has caught
		 * bugs before, we keep it around; hopefully it will be a
		 * good hint when it fails again.  If it one day breaks on
		 * some other libc implementation, we'll revisit.
		 */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
		offset &= ~(page_size-1);

		if (copy_page(uffd, offset, args->apply_wp))
			args->missing_faults++;
	}
}

void *uffd_poll_thread(void *arg)
{
	struct uffd_args *args = (struct uffd_args *)arg;
	unsigned long cpu = args->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	if (!args->handle_fault)
		args->handle_fault = uffd_handle_page_fault;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (ret <= 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err("poll error: %d", ret);
		}
		if (pollfd[1].revents) {
			if (!(pollfd[1].revents & POLLIN))
				err("pollfd[1].revents %d", pollfd[1].revents);
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		if (uffd_read_msg(uffd, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			args->handle_fault(&msg, args);
			break;
		case UFFD_EVENT_FORK:
			close(uffd);
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			area_remap = area_dst;	/* save for later unmap */
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}

static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
				     uffdio_copy->len,
				     offset);
	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
		if (uffdio_copy->copy != -EEXIST)
			err("UFFDIO_COPY retry error: %"PRId64,
			    (int64_t)uffdio_copy->copy);
	} else {
		err("UFFDIO_COPY retry unexpected: %"PRId64,
		    (int64_t)uffdio_copy->copy);
	}
}

static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range uffdio_wake;

	uffdio_wake.start = addr;
	uffdio_wake.len = len;

	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
		err("error waking %lu", addr);
}
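/*
 * Resolve one missing fault at the given page offset by copying the
 * corresponding page from area_src into area_dst with UFFDIO_COPY.
 * Returns 1 if this call installed the page, 0 if someone else beat
 * us to it (-EEXIST).  When retry is set (and test_uffdio_copy_eexist
 * allows it), the copy is repeated once through the alias mapping to
 * exercise the kernel's -EEXIST path.
 */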
int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu", offset);
	uffdio_copy.dst = (unsigned long) area_dst + offset;
	uffdio_copy.src = (unsigned long) area_src + offset;
	uffdio_copy.len = page_size;
	if (wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in uffdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		wake_range(ufd, uffdio_copy.dst, page_size);
	} else if (uffdio_copy.copy != page_size) {
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
	} else {
		if (test_uffdio_copy_eexist && retry) {
			test_uffdio_copy_eexist = false;
			retry_copy_page(ufd, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}

int copy_page(int ufd, unsigned long offset, bool wp)
{
	return __copy_page(ufd, offset, false, wp);
}

int uffd_open_dev(unsigned int flags)
{
	int fd, uffd;

	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
	if (fd < 0)
		return fd;
	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
	close(fd);

	return uffd;
}

int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
	return syscall(__NR_userfaultfd, flags);
#else
	return -1;
#endif
}

int uffd_open(unsigned int flags)
{
	int uffd = uffd_open_sys(flags);

	if (uffd < 0)
		uffd = uffd_open_dev(flags);

	return uffd;
}

int uffd_get_features(uint64_t *features)
{
	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
	/*
	 * This should by default work in most kernels; the feature list
	 * will be the same no matter what we pass in here.
	 */
	int fd = uffd_open(UFFD_USER_MODE_ONLY);

	if (fd < 0)
		/* Maybe the kernel is older than user-only mode? */
		fd = uffd_open(0);

	if (fd < 0)
		return fd;

	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
		close(fd);
		return -errno;
	}

	*features = uffdio_api.features;
	close(fd);

	return 0;
}
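/*
 * Typical usage of these helpers (a sketch only, not part of this
 * file's API -- the flow below mirrors how callers such as the uffd
 * unit tests drive it, and assumes the harness has already set up the
 * globals nr_cpus, nr_pages, page_size and uffd_test_ops):
 *
 *	const char *errmsg;
 *	uint64_t features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
 *	struct uffd_args args = { .cpu = 0 };
 *	pthread_t poller;
 *
 *	if (uffd_test_ctx_init(features, &errmsg))
 *		err("init: %s", errmsg);
 *	// register area_dst with UFFDIO_REGISTER, then:
 *	pthread_create(&poller, NULL, uffd_poll_thread, &args);
 *	// ... touch area_dst from test threads; missing faults are
 *	// resolved from area_src via copy_page() ...
 *	// write one byte to pipefd[args.cpu * 2 + 1] to stop the poller
 *	pthread_join(poller, NULL);
 *	uffd_stats_report(&args, 1);
 */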