1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) 4 * 5 * There are examples in here of: 6 * * how to set protection keys on memory 7 * * how to set/clear bits in pkey registers (the rights register) 8 * * how to handle SEGV_PKUERR signals and extract pkey-relevant 9 * information from the siginfo 10 * 11 * Things to add: 12 * make sure KSM and KSM COW breaking works 13 * prefault pages in at malloc, or not 14 * protect MPX bounds tables with protection keys? 15 * make sure VMA splitting/merging is working correctly 16 * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys 17 * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel 18 * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks 19 * 20 * Compile like this: 21 * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 22 * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm 23 */ 24 #define _GNU_SOURCE 25 #define __SANE_USERSPACE_TYPES__ 26 #include <errno.h> 27 #include <linux/elf.h> 28 #include <linux/futex.h> 29 #include <time.h> 30 #include <sys/time.h> 31 #include <sys/syscall.h> 32 #include <string.h> 33 #include <stdio.h> 34 #include <stdint.h> 35 #include <stdbool.h> 36 #include <signal.h> 37 #include <assert.h> 38 #include <stdlib.h> 39 #include <ucontext.h> 40 #include <sys/mman.h> 41 #include <sys/types.h> 42 #include <sys/wait.h> 43 #include <sys/stat.h> 44 #include <fcntl.h> 45 #include <unistd.h> 46 #include <sys/ptrace.h> 47 #include <setjmp.h> 48 49 #include "pkey-helpers.h" 50 51 int iteration_nr = 1; 52 int test_nr; 53 54 u64 shadow_pkey_reg; 55 int dprint_in_signal; 56 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; 57 58 void cat_into_file(char *str, char *file) 59 { 60 int fd = open(file, O_RDWR); 61 int ret; 62 63 dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); 64 /* 65 * these need to be raw because they are called under 66 * pkey_assert() 67 */ 68 if (fd < 0) { 69 fprintf(stderr, "error opening '%s'\n", str); 70 perror("error: "); 71 exit(__LINE__); 72 } 73 74 ret = write(fd, str, strlen(str)); 75 if (ret != strlen(str)) { 76 perror("write to file failed"); 77 fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); 78 exit(__LINE__); 79 } 80 close(fd); 81 } 82 83 #if CONTROL_TRACING > 0 84 static int warned_tracing; 85 int tracing_root_ok(void) 86 { 87 if (geteuid() != 0) { 88 if (!warned_tracing) 89 fprintf(stderr, "WARNING: not run as root, " 90 "can not do tracing control\n"); 91 warned_tracing = 1; 92 return 0; 93 } 94 return 1; 95 } 96 #endif 97 98 void tracing_on(void) 99 { 100 #if CONTROL_TRACING > 0 101 #define TRACEDIR "/sys/kernel/debug/tracing" 102 char pidstr[32]; 103 104 if (!tracing_root_ok()) 105 return; 106 107 sprintf(pidstr, "%d", getpid()); 108 cat_into_file("0", TRACEDIR "/tracing_on"); 109 cat_into_file("\n", TRACEDIR "/trace"); 110 if (1) { 111 cat_into_file("function_graph", TRACEDIR "/current_tracer"); 112 cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); 113 } else { 114 cat_into_file("nop", TRACEDIR "/current_tracer"); 115 } 116 cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); 117 cat_into_file("1", TRACEDIR "/tracing_on"); 118 dprintf1("enabled tracing\n"); 119 #endif 120 } 121 122 void tracing_off(void) 123 { 124 #if CONTROL_TRACING > 0 125 if (!tracing_root_ok()) 126 return; 127 cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); 128 #endif 129 } 130 131 void abort_hooks(void) 132 { 133 fprintf(stderr, "running %s()...\n", __func__); 134 tracing_off(); 135 #ifdef SLEEP_ON_ABORT 136 sleep(SLEEP_ON_ABORT); 137 #endif 138 } 139 140 /* 141 * This attempts to have roughly a page of instructions followed by a few 142 * instructions that do a write, and another page of instructions. That 143 * way, we are pretty sure that the write is in the second page of 144 * instructions and has at least a page of padding behind it. 145 * 146 * *That* lets us be sure to madvise() away the write instruction, which 147 * will then fault, which makes sure that the fault code handles 148 * execute-only memory properly. 149 */ 150 #ifdef __powerpc64__ 151 /* This way, both 4K and 64K alignment are maintained */ 152 __attribute__((__aligned__(65536))) 153 #else 154 __attribute__((__aligned__(PAGE_SIZE))) 155 #endif 156 void lots_o_noops_around_write(int *write_to_me) 157 { 158 dprintf3("running %s()\n", __func__); 159 __page_o_noops(); 160 /* Assume this happens in the second page of instructions: */ 161 *write_to_me = __LINE__; 162 /* pad out by another page: */ 163 __page_o_noops(); 164 dprintf3("%s() done\n", __func__); 165 } 166 167 void dump_mem(void *dumpme, int len_bytes) 168 { 169 char *c = (void *)dumpme; 170 int i; 171 172 for (i = 0; i < len_bytes; i += sizeof(u64)) { 173 u64 *ptr = (u64 *)(c + i); 174 dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); 175 } 176 } 177 178 static u32 hw_pkey_get(int pkey, unsigned long flags) 179 { 180 u64 pkey_reg = __read_pkey_reg(); 181 182 dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", 183 __func__, pkey, flags, 0, 0); 184 dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); 185 186 return (u32) get_pkey_bits(pkey_reg, pkey); 187 } 188 189 static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) 190 { 191 u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); 192 u64 old_pkey_reg = __read_pkey_reg(); 193 u64 new_pkey_reg; 194 195 /* make sure that 'rights' only contains the bits we expect: */ 196 assert(!(rights & ~mask)); 197 198 /* modify bits accordingly in old pkey_reg and assign it */ 199 new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); 200 201 __write_pkey_reg(new_pkey_reg); 202 203 dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" 204 " pkey_reg now: %016llx old_pkey_reg: %016llx\n", 205 __func__, pkey, rights, flags, 0, __read_pkey_reg(), 206 old_pkey_reg); 207 return 0; 208 } 209 210 void pkey_disable_set(int pkey, int flags) 211 { 212 unsigned long syscall_flags = 0; 213 int ret; 214 int pkey_rights; 215 u64 orig_pkey_reg = read_pkey_reg(); 216 217 dprintf1("START->%s(%d, 0x%x)\n", __func__, 218 pkey, flags); 219 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 220 221 pkey_rights = hw_pkey_get(pkey, syscall_flags); 222 223 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 224 pkey, pkey, pkey_rights); 225 226 pkey_assert(pkey_rights >= 0); 227 228 pkey_rights |= flags; 229 230 ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); 231 assert(!ret); 232 /* pkey_reg and flags have the same format */ 233 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); 234 dprintf1("%s(%d) shadow: 0x%016llx\n", 235 __func__, pkey, shadow_pkey_reg); 236 237 pkey_assert(ret >= 0); 238 239 pkey_rights = hw_pkey_get(pkey, syscall_flags); 240 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 241 pkey, pkey, pkey_rights); 242 243 dprintf1("%s(%d) pkey_reg: 0x%016llx\n", 244 __func__, pkey, read_pkey_reg()); 245 if (flags) 246 pkey_assert(read_pkey_reg() >= orig_pkey_reg); 247 dprintf1("END<---%s(%d, 0x%x)\n", __func__, 248 pkey, flags); 249 } 250 251 void pkey_disable_clear(int pkey, int flags) 252 { 253 unsigned long syscall_flags = 0; 254 int ret; 255 int pkey_rights = hw_pkey_get(pkey, syscall_flags); 256 u64 orig_pkey_reg = read_pkey_reg(); 257 258 pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); 259 260 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 261 pkey, pkey, pkey_rights); 262 pkey_assert(pkey_rights >= 0); 263 264 pkey_rights &= ~flags; 265 266 ret = hw_pkey_set(pkey, pkey_rights, 0); 267 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); 268 pkey_assert(ret >= 0); 269 270 pkey_rights = hw_pkey_get(pkey, syscall_flags); 271 dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, 272 pkey, pkey, pkey_rights); 273 274 dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, 275 pkey, read_pkey_reg()); 276 if (flags) 277 assert(read_pkey_reg() <= orig_pkey_reg); 278 } 279 280 void pkey_write_allow(int pkey) 281 { 282 pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); 283 } 284 void pkey_write_deny(int pkey) 285 { 286 pkey_disable_set(pkey, PKEY_DISABLE_WRITE); 287 } 288 void pkey_access_allow(int pkey) 289 { 290 pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); 291 } 292 void pkey_access_deny(int pkey) 293 { 294 pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); 295 } 296 297 /* Failed address bound checks: */ 298 #ifndef SEGV_BNDERR 299 # define SEGV_BNDERR 3 300 #endif 301 302 #ifndef SEGV_PKUERR 303 # define SEGV_PKUERR 4 304 #endif 305 306 static char *si_code_str(int si_code) 307 { 308 if (si_code == SEGV_MAPERR) 309 return "SEGV_MAPERR"; 310 if (si_code == SEGV_ACCERR) 311 return "SEGV_ACCERR"; 312 if (si_code == SEGV_BNDERR) 313 return "SEGV_BNDERR"; 314 if (si_code == SEGV_PKUERR) 315 return "SEGV_PKUERR"; 316 return "UNKNOWN"; 317 } 318 319 int pkey_faults; 320 int last_si_pkey = -1; 321 void signal_handler(int signum, siginfo_t *si, void *vucontext) 322 { 323 ucontext_t *uctxt = vucontext; 324 int trapno; 325 unsigned long ip; 326 char *fpregs; 327 #if defined(__i386__) || defined(__x86_64__) /* arch */ 328 u32 *pkey_reg_ptr; 329 int pkey_reg_offset; 330 #endif /* arch */ 331 u64 siginfo_pkey; 332 u32 *si_pkey_ptr; 333 334 dprint_in_signal = 1; 335 dprintf1(">>>>===============SIGSEGV============================\n"); 336 dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", 337 __func__, __LINE__, 338 __read_pkey_reg(), shadow_pkey_reg); 339 340 trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; 341 ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; 342 fpregs = (char *) uctxt->uc_mcontext.fpregs; 343 344 dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", 345 __func__, trapno, ip, si_code_str(si->si_code), 346 si->si_code); 347 348 #if defined(__i386__) || defined(__x86_64__) /* arch */ 349 #ifdef __i386__ 350 /* 351 * 32-bit has some extra padding so that userspace can tell whether 352 * the XSTATE header is present in addition to the "legacy" FPU 353 * state. We just assume that it is here. 354 */ 355 fpregs += 0x70; 356 #endif /* i386 */ 357 pkey_reg_offset = pkey_reg_xstate_offset(); 358 pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); 359 360 /* 361 * If we got a PKEY fault, we *HAVE* to have at least one bit set in 362 * here. 363 */ 364 dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); 365 if (DEBUG_LEVEL > 4) 366 dump_mem(pkey_reg_ptr - 128, 256); 367 pkey_assert(*pkey_reg_ptr); 368 #endif /* arch */ 369 370 dprintf1("siginfo: %p\n", si); 371 dprintf1(" fpregs: %p\n", fpregs); 372 373 if ((si->si_code == SEGV_MAPERR) || 374 (si->si_code == SEGV_ACCERR) || 375 (si->si_code == SEGV_BNDERR)) { 376 printf("non-PK si_code, exiting...\n"); 377 exit(4); 378 } 379 380 si_pkey_ptr = siginfo_get_pkey_ptr(si); 381 dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); 382 dump_mem((u8 *)si_pkey_ptr - 8, 24); 383 siginfo_pkey = *si_pkey_ptr; 384 pkey_assert(siginfo_pkey < NR_PKEYS); 385 last_si_pkey = siginfo_pkey; 386 387 /* 388 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 389 * checking 390 */ 391 dprintf1("signal pkey_reg from pkey_reg: %016llx\n", 392 __read_pkey_reg()); 393 dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); 394 #if defined(__i386__) || defined(__x86_64__) /* arch */ 395 dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); 396 *(u64 *)pkey_reg_ptr = 0x00000000; 397 dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); 398 #elif defined(__powerpc64__) /* arch */ 399 /* restore access and let the faulting instruction continue */ 400 pkey_access_allow(siginfo_pkey); 401 #endif /* arch */ 402 pkey_faults++; 403 dprintf1("<<<<==================================================\n"); 404 dprint_in_signal = 0; 405 } 406 407 int wait_all_children(void) 408 { 409 int status; 410 return waitpid(-1, &status, 0); 411 } 412 413 void sig_chld(int x) 414 { 415 dprint_in_signal = 1; 416 dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); 417 dprint_in_signal = 0; 418 } 419 420 void setup_sigsegv_handler(void) 421 { 422 int r, rs; 423 struct sigaction newact; 424 struct sigaction oldact; 425 426 /* #PF is mapped to sigsegv */ 427 int signum = SIGSEGV; 428 429 newact.sa_handler = 0; 430 newact.sa_sigaction = signal_handler; 431 432 /*sigset_t - signals to block while in the handler */ 433 /* get the old signal mask. */ 434 rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); 435 pkey_assert(rs == 0); 436 437 /* call sa_sigaction, not sa_handler*/ 438 newact.sa_flags = SA_SIGINFO; 439 440 newact.sa_restorer = 0; /* void(*)(), obsolete */ 441 r = sigaction(signum, &newact, &oldact); 442 r = sigaction(SIGALRM, &newact, &oldact); 443 pkey_assert(r == 0); 444 } 445 446 void setup_handlers(void) 447 { 448 signal(SIGCHLD, &sig_chld); 449 setup_sigsegv_handler(); 450 } 451 452 pid_t fork_lazy_child(void) 453 { 454 pid_t forkret; 455 456 forkret = fork(); 457 pkey_assert(forkret >= 0); 458 dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); 459 460 if (!forkret) { 461 /* in the child */ 462 while (1) { 463 dprintf1("child sleeping...\n"); 464 sleep(30); 465 } 466 } 467 return forkret; 468 } 469 470 int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 471 unsigned long pkey) 472 { 473 int sret; 474 475 dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, 476 ptr, size, orig_prot, pkey); 477 478 errno = 0; 479 sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); 480 if (errno) { 481 dprintf2("SYS_mprotect_key sret: %d\n", sret); 482 dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); 483 dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); 484 if (DEBUG_LEVEL >= 2) 485 perror("SYS_mprotect_pkey"); 486 } 487 return sret; 488 } 489 490 int sys_pkey_alloc(unsigned long flags, unsigned long init_val) 491 { 492 int ret = syscall(SYS_pkey_alloc, flags, init_val); 493 dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", 494 __func__, flags, init_val, ret, errno); 495 return ret; 496 } 497 498 int alloc_pkey(void) 499 { 500 int ret; 501 unsigned long init_val = 0x0; 502 503 dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", 504 __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); 505 ret = sys_pkey_alloc(0, init_val); 506 /* 507 * pkey_alloc() sets PKEY register, so we need to reflect it in 508 * shadow_pkey_reg: 509 */ 510 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 511 " shadow: 0x%016llx\n", 512 __func__, __LINE__, ret, __read_pkey_reg(), 513 shadow_pkey_reg); 514 if (ret > 0) { 515 /* clear both the bits: */ 516 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, 517 ~PKEY_MASK); 518 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 519 " shadow: 0x%016llx\n", 520 __func__, 521 __LINE__, ret, __read_pkey_reg(), 522 shadow_pkey_reg); 523 /* 524 * move the new state in from init_val 525 * (remember, we cheated and init_val == pkey_reg format) 526 */ 527 shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, 528 init_val); 529 } 530 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 531 " shadow: 0x%016llx\n", 532 __func__, __LINE__, ret, __read_pkey_reg(), 533 shadow_pkey_reg); 534 dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); 535 /* for shadow checking: */ 536 read_pkey_reg(); 537 dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" 538 " shadow: 0x%016llx\n", 539 __func__, __LINE__, ret, __read_pkey_reg(), 540 shadow_pkey_reg); 541 return ret; 542 } 543 544 int sys_pkey_free(unsigned long pkey) 545 { 546 int ret = syscall(SYS_pkey_free, pkey); 547 dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); 548 return ret; 549 } 550 551 /* 552 * I had a bug where pkey bits could be set by mprotect() but 553 * not cleared. This ensures we get lots of random bit sets 554 * and clears on the vma and pte pkey bits. 555 */ 556 int alloc_random_pkey(void) 557 { 558 int max_nr_pkey_allocs; 559 int ret; 560 int i; 561 int alloced_pkeys[NR_PKEYS]; 562 int nr_alloced = 0; 563 int random_index; 564 memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); 565 566 /* allocate every possible key and make a note of which ones we got */ 567 max_nr_pkey_allocs = NR_PKEYS; 568 for (i = 0; i < max_nr_pkey_allocs; i++) { 569 int new_pkey = alloc_pkey(); 570 if (new_pkey < 0) 571 break; 572 alloced_pkeys[nr_alloced++] = new_pkey; 573 } 574 575 pkey_assert(nr_alloced > 0); 576 /* select a random one out of the allocated ones */ 577 random_index = rand() % nr_alloced; 578 ret = alloced_pkeys[random_index]; 579 /* now zero it out so we don't free it next */ 580 alloced_pkeys[random_index] = 0; 581 582 /* go through the allocated ones that we did not want and free them */ 583 for (i = 0; i < nr_alloced; i++) { 584 int free_ret; 585 if (!alloced_pkeys[i]) 586 continue; 587 free_ret = sys_pkey_free(alloced_pkeys[i]); 588 pkey_assert(!free_ret); 589 } 590 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 591 " shadow: 0x%016llx\n", __func__, 592 __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); 593 return ret; 594 } 595 596 int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, 597 unsigned long pkey) 598 { 599 int nr_iterations = random() % 100; 600 int ret; 601 602 while (0) { 603 int rpkey = alloc_random_pkey(); 604 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 605 dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 606 ptr, size, orig_prot, pkey, ret); 607 if (nr_iterations-- < 0) 608 break; 609 610 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 611 " shadow: 0x%016llx\n", 612 __func__, __LINE__, ret, __read_pkey_reg(), 613 shadow_pkey_reg); 614 sys_pkey_free(rpkey); 615 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 616 " shadow: 0x%016llx\n", 617 __func__, __LINE__, ret, __read_pkey_reg(), 618 shadow_pkey_reg); 619 } 620 pkey_assert(pkey < NR_PKEYS); 621 622 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); 623 dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", 624 ptr, size, orig_prot, pkey, ret); 625 pkey_assert(!ret); 626 dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" 627 " shadow: 0x%016llx\n", __func__, 628 __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); 629 return ret; 630 } 631 632 struct pkey_malloc_record { 633 void *ptr; 634 long size; 635 int prot; 636 }; 637 struct pkey_malloc_record *pkey_malloc_records; 638 struct pkey_malloc_record *pkey_last_malloc_record; 639 long nr_pkey_malloc_records; 640 void record_pkey_malloc(void *ptr, long size, int prot) 641 { 642 long i; 643 struct pkey_malloc_record *rec = NULL; 644 645 for (i = 0; i < nr_pkey_malloc_records; i++) { 646 rec = &pkey_malloc_records[i]; 647 /* find a free record */ 648 if (rec) 649 break; 650 } 651 if (!rec) { 652 /* every record is full */ 653 size_t old_nr_records = nr_pkey_malloc_records; 654 size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); 655 size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); 656 dprintf2("new_nr_records: %zd\n", new_nr_records); 657 dprintf2("new_size: %zd\n", new_size); 658 pkey_malloc_records = realloc(pkey_malloc_records, new_size); 659 pkey_assert(pkey_malloc_records != NULL); 660 rec = &pkey_malloc_records[nr_pkey_malloc_records]; 661 /* 662 * realloc() does not initialize memory, so zero it from 663 * the first new record all the way to the end. 664 */ 665 for (i = 0; i < new_nr_records - old_nr_records; i++) 666 memset(rec + i, 0, sizeof(*rec)); 667 } 668 dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", 669 (int)(rec - pkey_malloc_records), rec, ptr, size); 670 rec->ptr = ptr; 671 rec->size = size; 672 rec->prot = prot; 673 pkey_last_malloc_record = rec; 674 nr_pkey_malloc_records++; 675 } 676 677 void free_pkey_malloc(void *ptr) 678 { 679 long i; 680 int ret; 681 dprintf3("%s(%p)\n", __func__, ptr); 682 for (i = 0; i < nr_pkey_malloc_records; i++) { 683 struct pkey_malloc_record *rec = &pkey_malloc_records[i]; 684 dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", 685 ptr, i, rec, rec->ptr, rec->size); 686 if ((ptr < rec->ptr) || 687 (ptr >= rec->ptr + rec->size)) 688 continue; 689 690 dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", 691 ptr, i, rec, rec->ptr, rec->size); 692 nr_pkey_malloc_records--; 693 ret = munmap(rec->ptr, rec->size); 694 dprintf3("munmap ret: %d\n", ret); 695 pkey_assert(!ret); 696 dprintf3("clearing rec->ptr, rec: %p\n", rec); 697 rec->ptr = NULL; 698 dprintf3("done clearing rec->ptr, rec: %p\n", rec); 699 return; 700 } 701 pkey_assert(false); 702 } 703 704 705 void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) 706 { 707 void *ptr; 708 int ret; 709 710 read_pkey_reg(); 711 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 712 size, prot, pkey); 713 pkey_assert(pkey < NR_PKEYS); 714 ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 715 pkey_assert(ptr != (void *)-1); 716 ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); 717 pkey_assert(!ret); 718 record_pkey_malloc(ptr, size, prot); 719 read_pkey_reg(); 720 721 dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); 722 return ptr; 723 } 724 725 void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) 726 { 727 int ret; 728 void *ptr; 729 730 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 731 size, prot, pkey); 732 /* 733 * Guarantee we can fit at least one huge page in the resulting 734 * allocation by allocating space for 2: 735 */ 736 size = ALIGN_UP(size, HPAGE_SIZE * 2); 737 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 738 pkey_assert(ptr != (void *)-1); 739 record_pkey_malloc(ptr, size, prot); 740 mprotect_pkey(ptr, size, prot, pkey); 741 742 dprintf1("unaligned ptr: %p\n", ptr); 743 ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); 744 dprintf1(" aligned ptr: %p\n", ptr); 745 ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); 746 dprintf1("MADV_HUGEPAGE ret: %d\n", ret); 747 ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); 748 dprintf1("MADV_WILLNEED ret: %d\n", ret); 749 memset(ptr, 0, HPAGE_SIZE); 750 751 dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); 752 return ptr; 753 } 754 755 int hugetlb_setup_ok; 756 #define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" 757 #define GET_NR_HUGE_PAGES 10 758 void setup_hugetlbfs(void) 759 { 760 int err; 761 int fd; 762 char buf[256]; 763 long hpagesz_kb; 764 long hpagesz_mb; 765 766 if (geteuid() != 0) { 767 fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); 768 return; 769 } 770 771 cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); 772 773 /* 774 * Now go make sure that we got the pages and that they 775 * are PMD-level pages. Someone might have made PUD-level 776 * pages the default. 777 */ 778 hpagesz_kb = HPAGE_SIZE / 1024; 779 hpagesz_mb = hpagesz_kb / 1024; 780 sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); 781 fd = open(buf, O_RDONLY); 782 if (fd < 0) { 783 fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", 784 hpagesz_mb, strerror(errno)); 785 return; 786 } 787 788 /* -1 to guarantee leaving the trailing \0 */ 789 err = read(fd, buf, sizeof(buf)-1); 790 close(fd); 791 if (err <= 0) { 792 fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", 793 hpagesz_mb, strerror(errno)); 794 return; 795 } 796 797 if (atoi(buf) != GET_NR_HUGE_PAGES) { 798 fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", 799 hpagesz_mb, buf, GET_NR_HUGE_PAGES); 800 return; 801 } 802 803 hugetlb_setup_ok = 1; 804 } 805 806 void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) 807 { 808 void *ptr; 809 int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; 810 811 if (!hugetlb_setup_ok) 812 return PTR_ERR_ENOTSUP; 813 814 dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); 815 size = ALIGN_UP(size, HPAGE_SIZE * 2); 816 pkey_assert(pkey < NR_PKEYS); 817 ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); 818 pkey_assert(ptr != (void *)-1); 819 mprotect_pkey(ptr, size, prot, pkey); 820 821 record_pkey_malloc(ptr, size, prot); 822 823 dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); 824 return ptr; 825 } 826 827 void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) 828 { 829 void *ptr; 830 int fd; 831 832 dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, 833 size, prot, pkey); 834 pkey_assert(pkey < NR_PKEYS); 835 fd = open("/dax/foo", O_RDWR); 836 pkey_assert(fd >= 0); 837 838 ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); 839 pkey_assert(ptr != (void *)-1); 840 841 mprotect_pkey(ptr, size, prot, pkey); 842 843 record_pkey_malloc(ptr, size, prot); 844 845 dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); 846 close(fd); 847 return ptr; 848 } 849 850 void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { 851 852 malloc_pkey_with_mprotect, 853 malloc_pkey_with_mprotect_subpage, 854 malloc_pkey_anon_huge, 855 malloc_pkey_hugetlb 856 /* can not do direct with the pkey_mprotect() API: 857 malloc_pkey_mmap_direct, 858 malloc_pkey_mmap_dax, 859 */ 860 }; 861 862 void *malloc_pkey(long size, int prot, u16 pkey) 863 { 864 void *ret; 865 static int malloc_type; 866 int nr_malloc_types = ARRAY_SIZE(pkey_malloc); 867 868 pkey_assert(pkey < NR_PKEYS); 869 870 while (1) { 871 pkey_assert(malloc_type < nr_malloc_types); 872 873 ret = pkey_malloc[malloc_type](size, prot, pkey); 874 pkey_assert(ret != (void *)-1); 875 876 malloc_type++; 877 if (malloc_type >= nr_malloc_types) 878 malloc_type = (random()%nr_malloc_types); 879 880 /* try again if the malloc_type we tried is unsupported */ 881 if (ret == PTR_ERR_ENOTSUP) 882 continue; 883 884 break; 885 } 886 887 dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, 888 size, prot, pkey, ret); 889 return ret; 890 } 891 892 int last_pkey_faults; 893 #define UNKNOWN_PKEY -2 894 void expected_pkey_fault(int pkey) 895 { 896 dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", 897 __func__, last_pkey_faults, pkey_faults); 898 dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); 899 pkey_assert(last_pkey_faults + 1 == pkey_faults); 900 901 /* 902 * For exec-only memory, we do not know the pkey in 903 * advance, so skip this check. 904 */ 905 if (pkey != UNKNOWN_PKEY) 906 pkey_assert(last_si_pkey == pkey); 907 908 #if defined(__i386__) || defined(__x86_64__) /* arch */ 909 /* 910 * The signal handler shold have cleared out PKEY register to let the 911 * test program continue. We now have to restore it. 912 */ 913 if (__read_pkey_reg() != 0) 914 #else /* arch */ 915 if (__read_pkey_reg() != shadow_pkey_reg) 916 #endif /* arch */ 917 pkey_assert(0); 918 919 __write_pkey_reg(shadow_pkey_reg); 920 dprintf1("%s() set pkey_reg=%016llx to restore state after signal " 921 "nuked it\n", __func__, shadow_pkey_reg); 922 last_pkey_faults = pkey_faults; 923 last_si_pkey = -1; 924 } 925 926 #define do_not_expect_pkey_fault(msg) do { \ 927 if (last_pkey_faults != pkey_faults) \ 928 dprintf0("unexpected PKey fault: %s\n", msg); \ 929 pkey_assert(last_pkey_faults == pkey_faults); \ 930 } while (0) 931 932 int test_fds[10] = { -1 }; 933 int nr_test_fds; 934 void __save_test_fd(int fd) 935 { 936 pkey_assert(fd >= 0); 937 pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); 938 test_fds[nr_test_fds] = fd; 939 nr_test_fds++; 940 } 941 942 int get_test_read_fd(void) 943 { 944 int test_fd = open("/etc/passwd", O_RDONLY); 945 __save_test_fd(test_fd); 946 return test_fd; 947 } 948 949 void close_test_fds(void) 950 { 951 int i; 952 953 for (i = 0; i < nr_test_fds; i++) { 954 if (test_fds[i] < 0) 955 continue; 956 close(test_fds[i]); 957 test_fds[i] = -1; 958 } 959 nr_test_fds = 0; 960 } 961 962 #define barrier() __asm__ __volatile__("": : :"memory") 963 __attribute__((noinline)) int read_ptr(int *ptr) 964 { 965 /* 966 * Keep GCC from optimizing this away somehow 967 */ 968 barrier(); 969 return *ptr; 970 } 971 972 void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) 973 { 974 int i, err; 975 int max_nr_pkey_allocs; 976 int alloced_pkeys[NR_PKEYS]; 977 int nr_alloced = 0; 978 long size; 979 980 pkey_assert(pkey_last_malloc_record); 981 size = pkey_last_malloc_record->size; 982 /* 983 * This is a bit of a hack. But mprotect() requires 984 * huge-page-aligned sizes when operating on hugetlbfs. 985 * So, make sure that we use something that's a multiple 986 * of a huge page when we can. 987 */ 988 if (size >= HPAGE_SIZE) 989 size = HPAGE_SIZE; 990 991 /* allocate every possible key and make sure key-0 never got allocated */ 992 max_nr_pkey_allocs = NR_PKEYS; 993 for (i = 0; i < max_nr_pkey_allocs; i++) { 994 int new_pkey = alloc_pkey(); 995 pkey_assert(new_pkey != 0); 996 997 if (new_pkey < 0) 998 break; 999 alloced_pkeys[nr_alloced++] = new_pkey; 1000 } 1001 /* free all the allocated keys */ 1002 for (i = 0; i < nr_alloced; i++) { 1003 int free_ret; 1004 1005 if (!alloced_pkeys[i]) 1006 continue; 1007 free_ret = sys_pkey_free(alloced_pkeys[i]); 1008 pkey_assert(!free_ret); 1009 } 1010 1011 /* attach key-0 in various modes */ 1012 err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); 1013 pkey_assert(!err); 1014 err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); 1015 pkey_assert(!err); 1016 err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); 1017 pkey_assert(!err); 1018 err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); 1019 pkey_assert(!err); 1020 err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); 1021 pkey_assert(!err); 1022 } 1023 1024 void test_read_of_write_disabled_region(int *ptr, u16 pkey) 1025 { 1026 int ptr_contents; 1027 1028 dprintf1("disabling write access to PKEY[1], doing read\n"); 1029 pkey_write_deny(pkey); 1030 ptr_contents = read_ptr(ptr); 1031 dprintf1("*ptr: %d\n", ptr_contents); 1032 dprintf1("\n"); 1033 } 1034 void test_read_of_access_disabled_region(int *ptr, u16 pkey) 1035 { 1036 int ptr_contents; 1037 1038 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); 1039 read_pkey_reg(); 1040 pkey_access_deny(pkey); 1041 ptr_contents = read_ptr(ptr); 1042 dprintf1("*ptr: %d\n", ptr_contents); 1043 expected_pkey_fault(pkey); 1044 } 1045 1046 void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, 1047 u16 pkey) 1048 { 1049 int ptr_contents; 1050 1051 dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", 1052 pkey, ptr); 1053 ptr_contents = read_ptr(ptr); 1054 dprintf1("reading ptr before disabling the read : %d\n", 1055 ptr_contents); 1056 read_pkey_reg(); 1057 pkey_access_deny(pkey); 1058 ptr_contents = read_ptr(ptr); 1059 dprintf1("*ptr: %d\n", ptr_contents); 1060 expected_pkey_fault(pkey); 1061 } 1062 1063 void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, 1064 u16 pkey) 1065 { 1066 *ptr = __LINE__; 1067 dprintf1("disabling write access; after accessing the page, " 1068 "to PKEY[%02d], doing write\n", pkey); 1069 pkey_write_deny(pkey); 1070 *ptr = __LINE__; 1071 expected_pkey_fault(pkey); 1072 } 1073 1074 void test_write_of_write_disabled_region(int *ptr, u16 pkey) 1075 { 1076 dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); 1077 pkey_write_deny(pkey); 1078 *ptr = __LINE__; 1079 expected_pkey_fault(pkey); 1080 } 1081 void test_write_of_access_disabled_region(int *ptr, u16 pkey) 1082 { 1083 dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); 1084 pkey_access_deny(pkey); 1085 *ptr = __LINE__; 1086 expected_pkey_fault(pkey); 1087 } 1088 1089 void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, 1090 u16 pkey) 1091 { 1092 *ptr = __LINE__; 1093 dprintf1("disabling access; after accessing the page, " 1094 " to PKEY[%02d], doing write\n", pkey); 1095 pkey_access_deny(pkey); 1096 *ptr = __LINE__; 1097 expected_pkey_fault(pkey); 1098 } 1099 1100 void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) 1101 { 1102 int ret; 1103 int test_fd = get_test_read_fd(); 1104 1105 dprintf1("disabling access to PKEY[%02d], " 1106 "having kernel read() to buffer\n", pkey); 1107 pkey_access_deny(pkey); 1108 ret = read(test_fd, ptr, 1); 1109 dprintf1("read ret: %d\n", ret); 1110 pkey_assert(ret); 1111 } 1112 void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) 1113 { 1114 int ret; 1115 int test_fd = get_test_read_fd(); 1116 1117 pkey_write_deny(pkey); 1118 ret = read(test_fd, ptr, 100); 1119 dprintf1("read ret: %d\n", ret); 1120 if (ret < 0 && (DEBUG_LEVEL > 0)) 1121 perror("verbose read result (OK for this to be bad)"); 1122 pkey_assert(ret); 1123 } 1124 1125 void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) 1126 { 1127 int pipe_ret, vmsplice_ret; 1128 struct iovec iov; 1129 int pipe_fds[2]; 1130 1131 pipe_ret = pipe(pipe_fds); 1132 1133 pkey_assert(pipe_ret == 0); 1134 dprintf1("disabling access to PKEY[%02d], " 1135 "having kernel vmsplice from buffer\n", pkey); 1136 pkey_access_deny(pkey); 1137 iov.iov_base = ptr; 1138 iov.iov_len = PAGE_SIZE; 1139 vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); 1140 dprintf1("vmsplice() ret: %d\n", vmsplice_ret); 1141 pkey_assert(vmsplice_ret == -1); 1142 1143 close(pipe_fds[0]); 1144 close(pipe_fds[1]); 1145 } 1146 1147 void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) 1148 { 1149 int ignored = 0xdada; 1150 int futex_ret; 1151 int some_int = __LINE__; 1152 1153 dprintf1("disabling write to PKEY[%02d], " 1154 "doing futex gunk in buffer\n", pkey); 1155 *ptr = some_int; 1156 pkey_write_deny(pkey); 1157 futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, 1158 &ignored, ignored); 1159 if (DEBUG_LEVEL > 0) 1160 perror("futex"); 1161 dprintf1("futex() ret: %d\n", futex_ret); 1162 } 1163 1164 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1165 void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) 1166 { 1167 int err; 1168 int i; 1169 1170 /* Note: 0 is the default pkey, so don't mess with it */ 1171 for (i = 1; i < NR_PKEYS; i++) { 1172 if (pkey == i) 1173 continue; 1174 1175 dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); 1176 err = sys_pkey_free(i); 1177 pkey_assert(err); 1178 1179 err = sys_pkey_free(i); 1180 pkey_assert(err); 1181 1182 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); 1183 pkey_assert(err); 1184 } 1185 } 1186 1187 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1188 void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) 1189 { 1190 int err; 1191 int bad_pkey = NR_PKEYS+99; 1192 1193 /* pass a known-invalid pkey in: */ 1194 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); 1195 pkey_assert(err); 1196 } 1197 1198 void become_child(void) 1199 { 1200 pid_t forkret; 1201 1202 forkret = fork(); 1203 pkey_assert(forkret >= 0); 1204 dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); 1205 1206 if (!forkret) { 1207 /* in the child */ 1208 return; 1209 } 1210 exit(0); 1211 } 1212 1213 /* Assumes that all pkeys other than 'pkey' are unallocated */ 1214 void test_pkey_alloc_exhaust(int *ptr, u16 pkey) 1215 { 1216 int err; 1217 int allocated_pkeys[NR_PKEYS] = {0}; 1218 int nr_allocated_pkeys = 0; 1219 int i; 1220 1221 for (i = 0; i < NR_PKEYS*3; i++) { 1222 int new_pkey; 1223 dprintf1("%s() alloc loop: %d\n", __func__, i); 1224 new_pkey = alloc_pkey(); 1225 dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" 1226 " shadow: 0x%016llx\n", 1227 __func__, __LINE__, err, __read_pkey_reg(), 1228 shadow_pkey_reg); 1229 read_pkey_reg(); /* for shadow checking */ 1230 dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); 1231 if ((new_pkey == -1) && (errno == ENOSPC)) { 1232 dprintf2("%s() failed to allocate pkey after %d tries\n", 1233 __func__, nr_allocated_pkeys); 1234 } else { 1235 /* 1236 * Ensure the number of successes never 1237 * exceeds the number of keys supported 1238 * in the hardware. 1239 */ 1240 pkey_assert(nr_allocated_pkeys < NR_PKEYS); 1241 allocated_pkeys[nr_allocated_pkeys++] = new_pkey; 1242 } 1243 1244 /* 1245 * Make sure that allocation state is properly 1246 * preserved across fork(). 1247 */ 1248 if (i == NR_PKEYS*2) 1249 become_child(); 1250 } 1251 1252 dprintf3("%s()::%d\n", __func__, __LINE__); 1253 1254 /* 1255 * On x86: 1256 * There are 16 pkeys supported in hardware. Three are 1257 * allocated by the time we get here: 1258 * 1. The default key (0) 1259 * 2. One possibly consumed by an execute-only mapping. 1260 * 3. One allocated by the test code and passed in via 1261 * 'pkey' to this function. 1262 * Ensure that we can allocate at least another 13 (16-3). 1263 * 1264 * On powerpc: 1265 * There are either 5, 28, 29 or 32 pkeys supported in 1266 * hardware depending on the page size (4K or 64K) and 1267 * platform (powernv or powervm). Four are allocated by 1268 * the time we get here. These include pkey-0, pkey-1, 1269 * exec-only pkey and the one allocated by the test code. 1270 * Ensure that we can allocate the remaining. 1271 */ 1272 pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); 1273 1274 for (i = 0; i < nr_allocated_pkeys; i++) { 1275 err = sys_pkey_free(allocated_pkeys[i]); 1276 pkey_assert(!err); 1277 read_pkey_reg(); /* for shadow checking */ 1278 } 1279 } 1280 1281 void arch_force_pkey_reg_init(void) 1282 { 1283 #if defined(__i386__) || defined(__x86_64__) /* arch */ 1284 u64 *buf; 1285 1286 /* 1287 * All keys should be allocated and set to allow reads and 1288 * writes, so the register should be all 0. If not, just 1289 * skip the test. 1290 */ 1291 if (read_pkey_reg()) 1292 return; 1293 1294 /* 1295 * Just allocate an absurd about of memory rather than 1296 * doing the XSAVE size enumeration dance. 1297 */ 1298 buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1299 1300 /* These __builtins require compiling with -mxsave */ 1301 1302 /* XSAVE to build a valid buffer: */ 1303 __builtin_ia32_xsave(buf, XSTATE_PKEY); 1304 /* Clear XSTATE_BV[PKRU]: */ 1305 buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; 1306 /* XRSTOR will likely get PKRU back to the init state: */ 1307 __builtin_ia32_xrstor(buf, XSTATE_PKEY); 1308 1309 munmap(buf, 1*MB); 1310 #endif 1311 } 1312 1313 1314 /* 1315 * This is mostly useless on ppc for now. But it will not 1316 * hurt anything and should give some better coverage as 1317 * a long-running test that continually checks the pkey 1318 * register. 1319 */ 1320 void test_pkey_init_state(int *ptr, u16 pkey) 1321 { 1322 int err; 1323 int allocated_pkeys[NR_PKEYS] = {0}; 1324 int nr_allocated_pkeys = 0; 1325 int i; 1326 1327 for (i = 0; i < NR_PKEYS; i++) { 1328 int new_pkey = alloc_pkey(); 1329 1330 if (new_pkey < 0) 1331 continue; 1332 allocated_pkeys[nr_allocated_pkeys++] = new_pkey; 1333 } 1334 1335 dprintf3("%s()::%d\n", __func__, __LINE__); 1336 1337 arch_force_pkey_reg_init(); 1338 1339 /* 1340 * Loop for a bit, hoping to get exercise the kernel 1341 * context switch code. 1342 */ 1343 for (i = 0; i < 1000000; i++) 1344 read_pkey_reg(); 1345 1346 for (i = 0; i < nr_allocated_pkeys; i++) { 1347 err = sys_pkey_free(allocated_pkeys[i]); 1348 pkey_assert(!err); 1349 read_pkey_reg(); /* for shadow checking */ 1350 } 1351 } 1352 1353 /* 1354 * pkey 0 is special. It is allocated by default, so you do not 1355 * have to call pkey_alloc() to use it first. Make sure that it 1356 * is usable. 1357 */ 1358 void test_mprotect_with_pkey_0(int *ptr, u16 pkey) 1359 { 1360 long size; 1361 int prot; 1362 1363 assert(pkey_last_malloc_record); 1364 size = pkey_last_malloc_record->size; 1365 /* 1366 * This is a bit of a hack. But mprotect() requires 1367 * huge-page-aligned sizes when operating on hugetlbfs. 1368 * So, make sure that we use something that's a multiple 1369 * of a huge page when we can. 1370 */ 1371 if (size >= HPAGE_SIZE) 1372 size = HPAGE_SIZE; 1373 prot = pkey_last_malloc_record->prot; 1374 1375 /* Use pkey 0 */ 1376 mprotect_pkey(ptr, size, prot, 0); 1377 1378 /* Make sure that we can set it back to the original pkey. */ 1379 mprotect_pkey(ptr, size, prot, pkey); 1380 } 1381 1382 void test_ptrace_of_child(int *ptr, u16 pkey) 1383 { 1384 __attribute__((__unused__)) int peek_result; 1385 pid_t child_pid; 1386 void *ignored = 0; 1387 long ret; 1388 int status; 1389 /* 1390 * This is the "control" for our little expermient. Make sure 1391 * we can always access it when ptracing. 1392 */ 1393 int *plain_ptr_unaligned = malloc(HPAGE_SIZE); 1394 int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); 1395 1396 /* 1397 * Fork a child which is an exact copy of this process, of course. 1398 * That means we can do all of our tests via ptrace() and then plain 1399 * memory access and ensure they work differently. 1400 */ 1401 child_pid = fork_lazy_child(); 1402 dprintf1("[%d] child pid: %d\n", getpid(), child_pid); 1403 1404 ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); 1405 if (ret) 1406 perror("attach"); 1407 dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); 1408 pkey_assert(ret != -1); 1409 ret = waitpid(child_pid, &status, WUNTRACED); 1410 if ((ret != child_pid) || !(WIFSTOPPED(status))) { 1411 fprintf(stderr, "weird waitpid result %ld stat %x\n", 1412 ret, status); 1413 pkey_assert(0); 1414 } 1415 dprintf2("waitpid ret: %ld\n", ret); 1416 dprintf2("waitpid status: %d\n", status); 1417 1418 pkey_access_deny(pkey); 1419 pkey_write_deny(pkey); 1420 1421 /* Write access, untested for now: 1422 ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); 1423 pkey_assert(ret != -1); 1424 dprintf1("poke at %p: %ld\n", peek_at, ret); 1425 */ 1426 1427 /* 1428 * Try to access the pkey-protected "ptr" via ptrace: 1429 */ 1430 ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); 1431 /* expect it to work, without an error: */ 1432 pkey_assert(ret != -1); 1433 /* Now access from the current task, and expect an exception: */ 1434 peek_result = read_ptr(ptr); 1435 expected_pkey_fault(pkey); 1436 1437 /* 1438 * Try to access the NON-pkey-protected "plain_ptr" via ptrace: 1439 */ 1440 ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); 1441 /* expect it to work, without an error: */ 1442 pkey_assert(ret != -1); 1443 /* Now access from the current task, and expect NO exception: */ 1444 peek_result = read_ptr(plain_ptr); 1445 do_not_expect_pkey_fault("read plain pointer after ptrace"); 1446 1447 ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); 1448 pkey_assert(ret != -1); 1449 1450 ret = kill(child_pid, SIGKILL); 1451 pkey_assert(ret != -1); 1452 1453 wait(&status); 1454 1455 free(plain_ptr_unaligned); 1456 } 1457 1458 void *get_pointer_to_instructions(void) 1459 { 1460 void *p1; 1461 1462 p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); 1463 dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); 1464 /* lots_o_noops_around_write should be page-aligned already */ 1465 assert(p1 == &lots_o_noops_around_write); 1466 1467 /* Point 'p1' at the *second* page of the function: */ 1468 p1 += PAGE_SIZE; 1469 1470 /* 1471 * Try to ensure we fault this in on next touch to ensure 1472 * we get an instruction fault as opposed to a data one 1473 */ 1474 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1475 1476 return p1; 1477 } 1478 1479 void test_executing_on_unreadable_memory(int *ptr, u16 pkey) 1480 { 1481 void *p1; 1482 int scratch; 1483 int ptr_contents; 1484 int ret; 1485 1486 p1 = get_pointer_to_instructions(); 1487 lots_o_noops_around_write(&scratch); 1488 ptr_contents = read_ptr(p1); 1489 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1490 1491 ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); 1492 pkey_assert(!ret); 1493 pkey_access_deny(pkey); 1494 1495 dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); 1496 1497 /* 1498 * Make sure this is an *instruction* fault 1499 */ 1500 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1501 lots_o_noops_around_write(&scratch); 1502 do_not_expect_pkey_fault("executing on PROT_EXEC memory"); 1503 expect_fault_on_read_execonly_key(p1, pkey); 1504 } 1505 1506 void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) 1507 { 1508 void *p1; 1509 int scratch; 1510 int ptr_contents; 1511 int ret; 1512 1513 dprintf1("%s() start\n", __func__); 1514 1515 p1 = get_pointer_to_instructions(); 1516 lots_o_noops_around_write(&scratch); 1517 ptr_contents = read_ptr(p1); 1518 dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); 1519 1520 /* Use a *normal* mprotect(), not mprotect_pkey(): */ 1521 ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); 1522 pkey_assert(!ret); 1523 1524 /* 1525 * Reset the shadow, assuming that the above mprotect() 1526 * correctly changed PKRU, but to an unknown value since 1527 * the actual allocated pkey is unknown. 1528 */ 1529 shadow_pkey_reg = __read_pkey_reg(); 1530 1531 dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); 1532 1533 /* Make sure this is an *instruction* fault */ 1534 madvise(p1, PAGE_SIZE, MADV_DONTNEED); 1535 lots_o_noops_around_write(&scratch); 1536 do_not_expect_pkey_fault("executing on PROT_EXEC memory"); 1537 expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); 1538 1539 /* 1540 * Put the memory back to non-PROT_EXEC. Should clear the 1541 * exec-only pkey off the VMA and allow it to be readable 1542 * again. Go to PROT_NONE first to check for a kernel bug 1543 * that did not clear the pkey when doing PROT_NONE. 1544 */ 1545 ret = mprotect(p1, PAGE_SIZE, PROT_NONE); 1546 pkey_assert(!ret); 1547 1548 ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); 1549 pkey_assert(!ret); 1550 ptr_contents = read_ptr(p1); 1551 do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); 1552 } 1553 1554 #if defined(__i386__) || defined(__x86_64__) 1555 void test_ptrace_modifies_pkru(int *ptr, u16 pkey) 1556 { 1557 u32 new_pkru; 1558 pid_t child; 1559 int status, ret; 1560 int pkey_offset = pkey_reg_xstate_offset(); 1561 size_t xsave_size = cpu_max_xsave_size(); 1562 void *xsave; 1563 u32 *pkey_register; 1564 u64 *xstate_bv; 1565 struct iovec iov; 1566 1567 new_pkru = ~read_pkey_reg(); 1568 /* Don't make PROT_EXEC mappings inaccessible */ 1569 new_pkru &= ~3; 1570 1571 child = fork(); 1572 pkey_assert(child >= 0); 1573 dprintf3("[%d] fork() ret: %d\n", getpid(), child); 1574 if (!child) { 1575 ptrace(PTRACE_TRACEME, 0, 0, 0); 1576 /* Stop and allow the tracer to modify PKRU directly */ 1577 raise(SIGSTOP); 1578 1579 /* 1580 * need __read_pkey_reg() version so we do not do shadow_pkey_reg 1581 * checking 1582 */ 1583 if (__read_pkey_reg() != new_pkru) 1584 exit(1); 1585 1586 /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ 1587 raise(SIGSTOP); 1588 1589 if (__read_pkey_reg() != 0) 1590 exit(1); 1591 1592 /* Stop and allow the tracer to examine PKRU */ 1593 raise(SIGSTOP); 1594 1595 exit(0); 1596 } 1597 1598 pkey_assert(child == waitpid(child, &status, 0)); 1599 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1600 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1601 1602 xsave = (void *)malloc(xsave_size); 1603 pkey_assert(xsave > 0); 1604 1605 /* Modify the PKRU register directly */ 1606 iov.iov_base = xsave; 1607 iov.iov_len = xsave_size; 1608 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1609 pkey_assert(ret == 0); 1610 1611 pkey_register = (u32 *)(xsave + pkey_offset); 1612 pkey_assert(*pkey_register == read_pkey_reg()); 1613 1614 *pkey_register = new_pkru; 1615 1616 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1617 pkey_assert(ret == 0); 1618 1619 /* Test that the modification is visible in ptrace before any execution */ 1620 memset(xsave, 0xCC, xsave_size); 1621 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1622 pkey_assert(ret == 0); 1623 pkey_assert(*pkey_register == new_pkru); 1624 1625 /* Execute the tracee */ 1626 ret = ptrace(PTRACE_CONT, child, 0, 0); 1627 pkey_assert(ret == 0); 1628 1629 /* Test that the tracee saw the PKRU value change */ 1630 pkey_assert(child == waitpid(child, &status, 0)); 1631 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1632 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1633 1634 /* Test that the modification is visible in ptrace after execution */ 1635 memset(xsave, 0xCC, xsave_size); 1636 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1637 pkey_assert(ret == 0); 1638 pkey_assert(*pkey_register == new_pkru); 1639 1640 /* Clear the PKRU bit from XSTATE_BV */ 1641 xstate_bv = (u64 *)(xsave + 512); 1642 *xstate_bv &= ~(1 << 9); 1643 1644 ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1645 pkey_assert(ret == 0); 1646 1647 /* Test that the modification is visible in ptrace before any execution */ 1648 memset(xsave, 0xCC, xsave_size); 1649 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1650 pkey_assert(ret == 0); 1651 pkey_assert(*pkey_register == 0); 1652 1653 ret = ptrace(PTRACE_CONT, child, 0, 0); 1654 pkey_assert(ret == 0); 1655 1656 /* Test that the tracee saw the PKRU value go to 0 */ 1657 pkey_assert(child == waitpid(child, &status, 0)); 1658 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1659 pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); 1660 1661 /* Test that the modification is visible in ptrace after execution */ 1662 memset(xsave, 0xCC, xsave_size); 1663 ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); 1664 pkey_assert(ret == 0); 1665 pkey_assert(*pkey_register == 0); 1666 1667 ret = ptrace(PTRACE_CONT, child, 0, 0); 1668 pkey_assert(ret == 0); 1669 pkey_assert(child == waitpid(child, &status, 0)); 1670 dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); 1671 pkey_assert(WIFEXITED(status)); 1672 pkey_assert(WEXITSTATUS(status) == 0); 1673 free(xsave); 1674 } 1675 #endif 1676 1677 void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) 1678 { 1679 int size = PAGE_SIZE; 1680 int sret; 1681 1682 if (cpu_has_pkeys()) { 1683 dprintf1("SKIP: %s: no CPU support\n", __func__); 1684 return; 1685 } 1686 1687 sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); 1688 pkey_assert(sret < 0); 1689 } 1690 1691 void (*pkey_tests[])(int *ptr, u16 pkey) = { 1692 test_read_of_write_disabled_region, 1693 test_read_of_access_disabled_region, 1694 test_read_of_access_disabled_region_with_page_already_mapped, 1695 test_write_of_write_disabled_region, 1696 test_write_of_write_disabled_region_with_page_already_mapped, 1697 test_write_of_access_disabled_region, 1698 test_write_of_access_disabled_region_with_page_already_mapped, 1699 test_kernel_write_of_access_disabled_region, 1700 test_kernel_write_of_write_disabled_region, 1701 test_kernel_gup_of_access_disabled_region, 1702 test_kernel_gup_write_to_write_disabled_region, 1703 test_executing_on_unreadable_memory, 1704 test_implicit_mprotect_exec_only_memory, 1705 test_mprotect_with_pkey_0, 1706 test_ptrace_of_child, 1707 test_pkey_init_state, 1708 test_pkey_syscalls_on_non_allocated_pkey, 1709 test_pkey_syscalls_bad_args, 1710 test_pkey_alloc_exhaust, 1711 test_pkey_alloc_free_attach_pkey0, 1712 #if defined(__i386__) || defined(__x86_64__) 1713 test_ptrace_modifies_pkru, 1714 #endif 1715 }; 1716 1717 void run_tests_once(void) 1718 { 1719 int *ptr; 1720 int prot = PROT_READ|PROT_WRITE; 1721 1722 for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { 1723 int pkey; 1724 int orig_pkey_faults = pkey_faults; 1725 1726 dprintf1("======================\n"); 1727 dprintf1("test %d preparing...\n", test_nr); 1728 1729 tracing_on(); 1730 pkey = alloc_random_pkey(); 1731 dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); 1732 ptr = malloc_pkey(PAGE_SIZE, prot, pkey); 1733 dprintf1("test %d starting...\n", test_nr); 1734 pkey_tests[test_nr](ptr, pkey); 1735 dprintf1("freeing test memory: %p\n", ptr); 1736 free_pkey_malloc(ptr); 1737 sys_pkey_free(pkey); 1738 1739 dprintf1("pkey_faults: %d\n", pkey_faults); 1740 dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); 1741 1742 tracing_off(); 1743 close_test_fds(); 1744 1745 printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); 1746 dprintf1("======================\n\n"); 1747 } 1748 iteration_nr++; 1749 } 1750 1751 void pkey_setup_shadow(void) 1752 { 1753 shadow_pkey_reg = __read_pkey_reg(); 1754 } 1755 1756 int main(void) 1757 { 1758 int nr_iterations = 22; 1759 int pkeys_supported = is_pkeys_supported(); 1760 1761 srand((unsigned int)time(NULL)); 1762 1763 setup_handlers(); 1764 1765 printf("has pkeys: %d\n", pkeys_supported); 1766 1767 if (!pkeys_supported) { 1768 int size = PAGE_SIZE; 1769 int *ptr; 1770 1771 printf("running PKEY tests for unsupported CPU/OS\n"); 1772 1773 ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 1774 assert(ptr != (void *)-1); 1775 test_mprotect_pkey_on_unsupported_cpu(ptr, 1); 1776 exit(0); 1777 } 1778 1779 pkey_setup_shadow(); 1780 printf("startup pkey_reg: %016llx\n", read_pkey_reg()); 1781 setup_hugetlbfs(); 1782 1783 while (nr_iterations-- > 0) 1784 run_tests_once(); 1785 1786 printf("done (all tests OK)\n"); 1787 return 0; 1788 } 1789