1 /* 2 * PowerPC version 3 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) 4 * 5 * Derived from "arch/i386/mm/fault.c" 6 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * 8 * Modified by Cort Dougan and Paul Mackerras. 9 * 10 * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com) 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 */ 17 18 #include <linux/signal.h> 19 #include <linux/sched.h> 20 #include <linux/sched/task_stack.h> 21 #include <linux/kernel.h> 22 #include <linux/errno.h> 23 #include <linux/string.h> 24 #include <linux/types.h> 25 #include <linux/pagemap.h> 26 #include <linux/ptrace.h> 27 #include <linux/mman.h> 28 #include <linux/mm.h> 29 #include <linux/interrupt.h> 30 #include <linux/highmem.h> 31 #include <linux/extable.h> 32 #include <linux/kprobes.h> 33 #include <linux/kdebug.h> 34 #include <linux/perf_event.h> 35 #include <linux/ratelimit.h> 36 #include <linux/context_tracking.h> 37 #include <linux/hugetlb.h> 38 #include <linux/uaccess.h> 39 40 #include <asm/firmware.h> 41 #include <asm/page.h> 42 #include <asm/pgtable.h> 43 #include <asm/mmu.h> 44 #include <asm/mmu_context.h> 45 #include <asm/tlbflush.h> 46 #include <asm/siginfo.h> 47 #include <asm/debug.h> 48 49 static inline bool notify_page_fault(struct pt_regs *regs) 50 { 51 bool ret = false; 52 53 #ifdef CONFIG_KPROBES 54 /* kprobe_running() needs smp_processor_id() */ 55 if (!user_mode(regs)) { 56 preempt_disable(); 57 if (kprobe_running() && kprobe_fault_handler(regs, 11)) 58 ret = true; 59 preempt_enable(); 60 } 61 #endif /* CONFIG_KPROBES */ 62 63 if (unlikely(debugger_fault_handler(regs))) 64 ret = true; 65 66 return ret; 67 } 68 69 /* 70 * Check whether the instruction inst is a store using 71 * an update addressing form which will update r1. 72 */ 73 static bool store_updates_sp(unsigned int inst) 74 { 75 /* check for 1 in the rA field */ 76 if (((inst >> 16) & 0x1f) != 1) 77 return false; 78 /* check major opcode */ 79 switch (inst >> 26) { 80 case OP_STWU: 81 case OP_STBU: 82 case OP_STHU: 83 case OP_STFSU: 84 case OP_STFDU: 85 return true; 86 case OP_STD: /* std or stdu */ 87 return (inst & 3) == 1; 88 case OP_31: 89 /* check minor opcode */ 90 switch ((inst >> 1) & 0x3ff) { 91 case OP_31_XOP_STDUX: 92 case OP_31_XOP_STWUX: 93 case OP_31_XOP_STBUX: 94 case OP_31_XOP_STHUX: 95 case OP_31_XOP_STFSUX: 96 case OP_31_XOP_STFDUX: 97 return true; 98 } 99 } 100 return false; 101 } 102 /* 103 * do_page_fault error handling helpers 104 */ 105 106 static int 107 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code, 108 int pkey) 109 { 110 /* 111 * If we are in kernel mode, bail out with a SEGV, this will 112 * be caught by the assembly which will restore the non-volatile 113 * registers before calling bad_page_fault() 114 */ 115 if (!user_mode(regs)) 116 return SIGSEGV; 117 118 _exception_pkey(SIGSEGV, regs, si_code, address, pkey); 119 120 return 0; 121 } 122 123 static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address) 124 { 125 return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0); 126 } 127 128 static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code, 129 int pkey) 130 { 131 struct mm_struct *mm = current->mm; 132 133 /* 134 * Something tried to access memory that isn't in our memory map.. 135 * Fix it, but check if it's kernel or user first.. 136 */ 137 up_read(&mm->mmap_sem); 138 139 return __bad_area_nosemaphore(regs, address, si_code, pkey); 140 } 141 142 static noinline int bad_area(struct pt_regs *regs, unsigned long address) 143 { 144 return __bad_area(regs, address, SEGV_MAPERR, 0); 145 } 146 147 static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address, 148 int pkey) 149 { 150 return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey); 151 } 152 153 static noinline int bad_access(struct pt_regs *regs, unsigned long address) 154 { 155 return __bad_area(regs, address, SEGV_ACCERR, 0); 156 } 157 158 static int do_sigbus(struct pt_regs *regs, unsigned long address, 159 unsigned int fault) 160 { 161 siginfo_t info; 162 unsigned int lsb = 0; 163 164 if (!user_mode(regs)) 165 return SIGBUS; 166 167 current->thread.trap_nr = BUS_ADRERR; 168 clear_siginfo(&info); 169 info.si_signo = SIGBUS; 170 info.si_errno = 0; 171 info.si_code = BUS_ADRERR; 172 info.si_addr = (void __user *)address; 173 #ifdef CONFIG_MEMORY_FAILURE 174 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { 175 pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 176 current->comm, current->pid, address); 177 info.si_code = BUS_MCEERR_AR; 178 } 179 180 if (fault & VM_FAULT_HWPOISON_LARGE) 181 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); 182 if (fault & VM_FAULT_HWPOISON) 183 lsb = PAGE_SHIFT; 184 #endif 185 info.si_addr_lsb = lsb; 186 force_sig_info(SIGBUS, &info, current); 187 return 0; 188 } 189 190 static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) 191 { 192 /* 193 * Kernel page fault interrupted by SIGKILL. We have no reason to 194 * continue processing. 195 */ 196 if (fatal_signal_pending(current) && !user_mode(regs)) 197 return SIGKILL; 198 199 /* Out of memory */ 200 if (fault & VM_FAULT_OOM) { 201 /* 202 * We ran out of memory, or some other thing happened to us that 203 * made us unable to handle the page fault gracefully. 204 */ 205 if (!user_mode(regs)) 206 return SIGSEGV; 207 pagefault_out_of_memory(); 208 } else { 209 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 210 VM_FAULT_HWPOISON_LARGE)) 211 return do_sigbus(regs, addr, fault); 212 else if (fault & VM_FAULT_SIGSEGV) 213 return bad_area_nosemaphore(regs, addr); 214 else 215 BUG(); 216 } 217 return 0; 218 } 219 220 /* Is this a bad kernel fault ? */ 221 static bool bad_kernel_fault(bool is_exec, unsigned long error_code, 222 unsigned long address) 223 { 224 if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) { 225 printk_ratelimited(KERN_CRIT "kernel tried to execute" 226 " exec-protected page (%lx) -" 227 "exploit attempt? (uid: %d)\n", 228 address, from_kuid(&init_user_ns, 229 current_uid())); 230 } 231 return is_exec || (address >= TASK_SIZE); 232 } 233 234 static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, 235 struct vm_area_struct *vma, unsigned int flags, 236 bool *must_retry) 237 { 238 /* 239 * N.B. The POWER/Open ABI allows programs to access up to 240 * 288 bytes below the stack pointer. 241 * The kernel signal delivery code writes up to about 1.5kB 242 * below the stack pointer (r1) before decrementing it. 243 * The exec code can write slightly over 640kB to the stack 244 * before setting the user r1. Thus we allow the stack to 245 * expand to 1MB without further checks. 246 */ 247 if (address + 0x100000 < vma->vm_end) { 248 unsigned int __user *nip = (unsigned int __user *)regs->nip; 249 /* get user regs even if this fault is in kernel mode */ 250 struct pt_regs *uregs = current->thread.regs; 251 if (uregs == NULL) 252 return true; 253 254 /* 255 * A user-mode access to an address a long way below 256 * the stack pointer is only valid if the instruction 257 * is one which would update the stack pointer to the 258 * address accessed if the instruction completed, 259 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb 260 * (or the byte, halfword, float or double forms). 261 * 262 * If we don't check this then any write to the area 263 * between the last mapped region and the stack will 264 * expand the stack rather than segfaulting. 265 */ 266 if (address + 2048 >= uregs->gpr[1]) 267 return false; 268 269 if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && 270 access_ok(VERIFY_READ, nip, sizeof(*nip))) { 271 unsigned int inst; 272 int res; 273 274 pagefault_disable(); 275 res = __get_user_inatomic(inst, nip); 276 pagefault_enable(); 277 if (!res) 278 return !store_updates_sp(inst); 279 *must_retry = true; 280 } 281 return true; 282 } 283 return false; 284 } 285 286 static bool access_error(bool is_write, bool is_exec, 287 struct vm_area_struct *vma) 288 { 289 /* 290 * Allow execution from readable areas if the MMU does not 291 * provide separate controls over reading and executing. 292 * 293 * Note: That code used to not be enabled for 4xx/BookE. 294 * It is now as I/D cache coherency for these is done at 295 * set_pte_at() time and I see no reason why the test 296 * below wouldn't be valid on those processors. This -may- 297 * break programs compiled with a really old ABI though. 298 */ 299 if (is_exec) { 300 return !(vma->vm_flags & VM_EXEC) && 301 (cpu_has_feature(CPU_FTR_NOEXECUTE) || 302 !(vma->vm_flags & (VM_READ | VM_WRITE))); 303 } 304 305 if (is_write) { 306 if (unlikely(!(vma->vm_flags & VM_WRITE))) 307 return true; 308 return false; 309 } 310 311 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) 312 return true; 313 /* 314 * We should ideally do the vma pkey access check here. But in the 315 * fault path, handle_mm_fault() also does the same check. To avoid 316 * these multiple checks, we skip it here and handle access error due 317 * to pkeys later. 318 */ 319 return false; 320 } 321 322 #ifdef CONFIG_PPC_SMLPAR 323 static inline void cmo_account_page_fault(void) 324 { 325 if (firmware_has_feature(FW_FEATURE_CMO)) { 326 u32 page_ins; 327 328 preempt_disable(); 329 page_ins = be32_to_cpu(get_lppaca()->page_ins); 330 page_ins += 1 << PAGE_FACTOR; 331 get_lppaca()->page_ins = cpu_to_be32(page_ins); 332 preempt_enable(); 333 } 334 } 335 #else 336 static inline void cmo_account_page_fault(void) { } 337 #endif /* CONFIG_PPC_SMLPAR */ 338 339 #ifdef CONFIG_PPC_STD_MMU 340 static void sanity_check_fault(bool is_write, unsigned long error_code) 341 { 342 /* 343 * For hash translation mode, we should never get a 344 * PROTFAULT. Any update to pte to reduce access will result in us 345 * removing the hash page table entry, thus resulting in a DSISR_NOHPTE 346 * fault instead of DSISR_PROTFAULT. 347 * 348 * A pte update to relax the access will not result in a hash page table 349 * entry invalidate and hence can result in DSISR_PROTFAULT. 350 * ptep_set_access_flags() doesn't do a hpte flush. This is why we have 351 * the special !is_write in the below conditional. 352 * 353 * For platforms that doesn't supports coherent icache and do support 354 * per page noexec bit, we do setup things such that we do the 355 * sync between D/I cache via fault. But that is handled via low level 356 * hash fault code (hash_page_do_lazy_icache()) and we should not reach 357 * here in such case. 358 * 359 * For wrong access that can result in PROTFAULT, the above vma->vm_flags 360 * check should handle those and hence we should fall to the bad_area 361 * handling correctly. 362 * 363 * For embedded with per page exec support that doesn't support coherent 364 * icache we do get PROTFAULT and we handle that D/I cache sync in 365 * set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON 366 * is conditional for server MMU. 367 * 368 * For radix, we can get prot fault for autonuma case, because radix 369 * page table will have them marked noaccess for user. 370 */ 371 if (!radix_enabled() && !is_write) 372 WARN_ON_ONCE(error_code & DSISR_PROTFAULT); 373 } 374 #else 375 static void sanity_check_fault(bool is_write, unsigned long error_code) { } 376 #endif /* CONFIG_PPC_STD_MMU */ 377 378 /* 379 * Define the correct "is_write" bit in error_code based 380 * on the processor family 381 */ 382 #if (defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) 383 #define page_fault_is_write(__err) ((__err) & ESR_DST) 384 #define page_fault_is_bad(__err) (0) 385 #else 386 #define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE) 387 #if defined(CONFIG_PPC_8xx) 388 #define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G) 389 #elif defined(CONFIG_PPC64) 390 #define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S) 391 #else 392 #define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S) 393 #endif 394 #endif 395 396 /* 397 * For 600- and 800-family processors, the error_code parameter is DSISR 398 * for a data fault, SRR1 for an instruction fault. For 400-family processors 399 * the error_code parameter is ESR for a data fault, 0 for an instruction 400 * fault. 401 * For 64-bit processors, the error_code parameter is 402 * - DSISR for a non-SLB data access fault, 403 * - SRR1 & 0x08000000 for a non-SLB instruction access fault 404 * - 0 any SLB fault. 405 * 406 * The return value is 0 if the fault was handled, or the signal 407 * number if this is a kernel fault that can't be handled here. 408 */ 409 static int __do_page_fault(struct pt_regs *regs, unsigned long address, 410 unsigned long error_code) 411 { 412 struct vm_area_struct * vma; 413 struct mm_struct *mm = current->mm; 414 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 415 int is_exec = TRAP(regs) == 0x400; 416 int is_user = user_mode(regs); 417 int is_write = page_fault_is_write(error_code); 418 int fault, major = 0; 419 bool must_retry = false; 420 421 if (notify_page_fault(regs)) 422 return 0; 423 424 if (unlikely(page_fault_is_bad(error_code))) { 425 if (is_user) { 426 _exception(SIGBUS, regs, BUS_OBJERR, address); 427 return 0; 428 } 429 return SIGBUS; 430 } 431 432 /* Additional sanity check(s) */ 433 sanity_check_fault(is_write, error_code); 434 435 /* 436 * The kernel should never take an execute fault nor should it 437 * take a page fault to a kernel address. 438 */ 439 if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address))) 440 return SIGSEGV; 441 442 /* 443 * If we're in an interrupt, have no user context or are running 444 * in a region with pagefaults disabled then we must not take the fault 445 */ 446 if (unlikely(faulthandler_disabled() || !mm)) { 447 if (is_user) 448 printk_ratelimited(KERN_ERR "Page fault in user mode" 449 " with faulthandler_disabled()=%d" 450 " mm=%p\n", 451 faulthandler_disabled(), mm); 452 return bad_area_nosemaphore(regs, address); 453 } 454 455 /* We restore the interrupt state now */ 456 if (!arch_irq_disabled_regs(regs)) 457 local_irq_enable(); 458 459 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 460 461 if (error_code & DSISR_KEYFAULT) 462 return bad_key_fault_exception(regs, address, 463 get_mm_addr_key(mm, address)); 464 465 /* 466 * We want to do this outside mmap_sem, because reading code around nip 467 * can result in fault, which will cause a deadlock when called with 468 * mmap_sem held 469 */ 470 if (is_user) 471 flags |= FAULT_FLAG_USER; 472 if (is_write) 473 flags |= FAULT_FLAG_WRITE; 474 if (is_exec) 475 flags |= FAULT_FLAG_INSTRUCTION; 476 477 /* When running in the kernel we expect faults to occur only to 478 * addresses in user space. All other faults represent errors in the 479 * kernel and should generate an OOPS. Unfortunately, in the case of an 480 * erroneous fault occurring in a code path which already holds mmap_sem 481 * we will deadlock attempting to validate the fault against the 482 * address space. Luckily the kernel only validly references user 483 * space from well defined areas of code, which are listed in the 484 * exceptions table. 485 * 486 * As the vast majority of faults will be valid we will only perform 487 * the source reference check when there is a possibility of a deadlock. 488 * Attempt to lock the address space, if we cannot we then validate the 489 * source. If this is invalid we can skip the address space check, 490 * thus avoiding the deadlock. 491 */ 492 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 493 if (!is_user && !search_exception_tables(regs->nip)) 494 return bad_area_nosemaphore(regs, address); 495 496 retry: 497 down_read(&mm->mmap_sem); 498 } else { 499 /* 500 * The above down_read_trylock() might have succeeded in 501 * which case we'll have missed the might_sleep() from 502 * down_read(): 503 */ 504 might_sleep(); 505 } 506 507 vma = find_vma(mm, address); 508 if (unlikely(!vma)) 509 return bad_area(regs, address); 510 if (likely(vma->vm_start <= address)) 511 goto good_area; 512 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) 513 return bad_area(regs, address); 514 515 /* The stack is being expanded, check if it's valid */ 516 if (unlikely(bad_stack_expansion(regs, address, vma, flags, 517 &must_retry))) { 518 if (!must_retry) 519 return bad_area(regs, address); 520 521 up_read(&mm->mmap_sem); 522 if (fault_in_pages_readable((const char __user *)regs->nip, 523 sizeof(unsigned int))) 524 return bad_area_nosemaphore(regs, address); 525 goto retry; 526 } 527 528 /* Try to expand it */ 529 if (unlikely(expand_stack(vma, address))) 530 return bad_area(regs, address); 531 532 good_area: 533 if (unlikely(access_error(is_write, is_exec, vma))) 534 return bad_access(regs, address); 535 536 /* 537 * If for any reason at all we couldn't handle the fault, 538 * make sure we exit gracefully rather than endlessly redo 539 * the fault. 540 */ 541 fault = handle_mm_fault(vma, address, flags); 542 543 #ifdef CONFIG_PPC_MEM_KEYS 544 /* 545 * we skipped checking for access error due to key earlier. 546 * Check that using handle_mm_fault error return. 547 */ 548 if (unlikely(fault & VM_FAULT_SIGSEGV) && 549 !arch_vma_access_permitted(vma, is_write, is_exec, 0)) { 550 551 int pkey = vma_pkey(vma); 552 553 up_read(&mm->mmap_sem); 554 return bad_key_fault_exception(regs, address, pkey); 555 } 556 #endif /* CONFIG_PPC_MEM_KEYS */ 557 558 major |= fault & VM_FAULT_MAJOR; 559 560 /* 561 * Handle the retry right now, the mmap_sem has been released in that 562 * case. 563 */ 564 if (unlikely(fault & VM_FAULT_RETRY)) { 565 /* We retry only once */ 566 if (flags & FAULT_FLAG_ALLOW_RETRY) { 567 /* 568 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 569 * of starvation. 570 */ 571 flags &= ~FAULT_FLAG_ALLOW_RETRY; 572 flags |= FAULT_FLAG_TRIED; 573 if (!fatal_signal_pending(current)) 574 goto retry; 575 } 576 577 /* 578 * User mode? Just return to handle the fatal exception otherwise 579 * return to bad_page_fault 580 */ 581 return is_user ? 0 : SIGBUS; 582 } 583 584 up_read(¤t->mm->mmap_sem); 585 586 if (unlikely(fault & VM_FAULT_ERROR)) 587 return mm_fault_error(regs, address, fault); 588 589 /* 590 * Major/minor page fault accounting. 591 */ 592 if (major) { 593 current->maj_flt++; 594 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); 595 cmo_account_page_fault(); 596 } else { 597 current->min_flt++; 598 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); 599 } 600 return 0; 601 } 602 NOKPROBE_SYMBOL(__do_page_fault); 603 604 int do_page_fault(struct pt_regs *regs, unsigned long address, 605 unsigned long error_code) 606 { 607 enum ctx_state prev_state = exception_enter(); 608 int rc = __do_page_fault(regs, address, error_code); 609 exception_exit(prev_state); 610 return rc; 611 } 612 NOKPROBE_SYMBOL(do_page_fault); 613 614 /* 615 * bad_page_fault is called when we have a bad access from the kernel. 616 * It is called from the DSI and ISI handlers in head.S and from some 617 * of the procedures in traps.c. 618 */ 619 void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) 620 { 621 const struct exception_table_entry *entry; 622 623 /* Are we prepared to handle this fault? */ 624 if ((entry = search_exception_tables(regs->nip)) != NULL) { 625 regs->nip = extable_fixup(entry); 626 return; 627 } 628 629 /* kernel has accessed a bad area */ 630 631 switch (TRAP(regs)) { 632 case 0x300: 633 case 0x380: 634 printk(KERN_ALERT "Unable to handle kernel paging request for " 635 "data at address 0x%08lx\n", regs->dar); 636 break; 637 case 0x400: 638 case 0x480: 639 printk(KERN_ALERT "Unable to handle kernel paging request for " 640 "instruction fetch\n"); 641 break; 642 case 0x600: 643 printk(KERN_ALERT "Unable to handle kernel paging request for " 644 "unaligned access at address 0x%08lx\n", regs->dar); 645 break; 646 default: 647 printk(KERN_ALERT "Unable to handle kernel paging request for " 648 "unknown fault\n"); 649 break; 650 } 651 printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", 652 regs->nip); 653 654 if (task_stack_end_corrupted(current)) 655 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); 656 657 die("Kernel access of bad area", regs, sig); 658 } 659