/*
 * Copyright (C) 1995  Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/magic.h>	/* STACK_END_MAGIC */
#include <linux/sched.h>	/* test_thread_flag(), ... */
#include <linux/kdebug.h>	/* oops_begin/end, ... */
#include <linux/module.h>	/* search_exception_table */
#include <linux/bootmem.h>	/* max_low_pfn */
#include <linux/kprobes.h>	/* __kprobes, ... */
#include <linux/mmiotrace.h>	/* kmmio_handler, ... */
#include <linux/perf_event.h>	/* perf_sw_event */

#include <asm/traps.h>		/* dotraplinkage, ... */
#include <asm/pgalloc.h>	/* pgd_*(), ... */
#include <asm/kmemcheck.h>	/* kmemcheck_*(), ... */

/*
 * Page fault error code bits:
 *
 *   bit 0 ==   0: no page found        1: protection fault
 *   bit 1 ==   0: read access          1: write access
 *   bit 2 ==   0: kernel-mode access   1: user-mode access
 *   bit 3 ==                           1: use of reserved bit detected
 *   bit 4 ==                           1: fault was an instruction fetch
 */
enum x86_pf_error_code {

	PF_PROT		= 1 << 0,
	PF_WRITE	= 1 << 1,
	PF_USER		= 1 << 2,
	PF_RSVD		= 1 << 3,
	PF_INSTR	= 1 << 4,
};

/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (kprobes_built_in() && !user_mode_vm(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
		 * Need to figure out under what instruction mode the
		 * instruction was issued. Could check the LDT for lm,
		 * but for now it's good enough to assume that long
		 * mode only uses well known segments or kernel.
		 */
		return (!user_mode(regs)) || (regs->cs == __USER_CS);
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes.
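		 * The test below accepts instr_lo values 0x0, 0x2 and 0x3,
		 * i.e. the LOCK, REPNE and REP prefixes.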
		 */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/*
	 * If it was an exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (instr < max_instr) {
		unsigned char opcode;

		if (probe_kernel_address(instr, opcode))
			break;

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}
	return prefetch;
}

static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
		     struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo	= si_signo;
	info.si_errno	= 0;
	info.si_code	= si_code;
	info.si_addr	= (void __user *)address;
	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;

	force_sig_info(si_signo, &info, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */
	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;

	if (!pmd_present(*pmd))
		set_pmd(pmd, *pmd_k);
	else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

	return pmd_k;
}

void vmalloc_sync_all(void)
{
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE && address < FIXADDR_TOP;
	     address += PMD_SIZE) {

		unsigned long flags;
		struct page *page;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			if (!vmalloc_sync_one(page_address(page), address))
				break;
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
}

/*
 * 32-bit:
 *
 *   Handle a fault on the vmalloc or module mapping area
 */
static noinline int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
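	 *
	 * (That is why the page directory is read from cr3 below instead
	 * of being taken from current->mm.)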
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}

/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
	unsigned long bit;

	if (!v8086_mode(regs))
		return;

	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		tsk->thread.screen_bitmap |= 1 << bit;
}

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3());
	pgd_t *pgd = &base[pgd_index(address)];
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#endif
	pmd = pmd_offset(pud_offset(pgd, address), address);
	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	printk("\n");
}

#else /* CONFIG_X86_64: */

void vmalloc_sync_all(void)
{
	unsigned long address;

	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
	     address += PGDIR_SIZE) {

		const pgd_t *pgd_ref = pgd_offset_k(address);
		unsigned long flags;
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
}

/*
 * 64-bit:
 *
 *   Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update.
	 * In the latter case just flush:
	 */
	pgd = pgd_offset(current->active_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;

	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared:
	 */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;

	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();

	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;

	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();

	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;

	pte = pte_offset_kernel(pmd, address);

	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that:
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();

	return 0;
}

static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd_t *pgd = base + pgd_index(address);
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	printk("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (bad_address(pud))
		goto bad;

	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	printk("PTE %lx", pte_val(*pte));
out:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;

	/*
	 * Pentium F0 0F C7 C8 bug workaround:
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & PF_INSTR) {
		unsigned int level;

		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(nx_warning, current_uid());
	}

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");

	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);

	dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	tsk->thread.cr2		= address;
	tsk->thread.trap_no	= 14;
	tsk->thread.error_code	= error_code;

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address)
{
	struct task_struct *tsk = current;
	unsigned long *stackend;
	unsigned long flags;
	int sig;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * 32-bit:
	 *
	 *   Valid to do another page fault here, because if this fault
	 *   had been triggered by is_prefetch fixup_exception would have
	 *   handled it.
	 *
	 * 64-bit:
	 *
	 *   Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	stackend = end_of_stack(tsk);
	if (*stackend != STACK_END_MAGIC)
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2		= address;
	tsk->thread.trap_no	= 14;
	tsk->thread.error_code	= error_code;

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
		tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");
}

static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here:
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space:
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

		if (unlikely(show_unhandled_signals))
			show_signal_msg(regs, error_code, address, tsk);

		/* Kernel addresses are always protection faults: */
		tsk->thread.cr2		= address;
		tsk->thread.error_code	= error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no	= 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);

		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void
out_of_memory(struct pt_regs *regs, unsigned long error_code,
	      unsigned long address)
{
	/*
	 * We ran out of memory, call the OOM killer, and return to userspace
	 * (which will retry the fault, or kill us if we got oom-killed):
	 */
	up_read(&current->mm->mmap_sem);

	pagefault_out_of_memory();
}

static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  unsigned int fault)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int code = BUS_ADRERR;

	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die: */
	if (!(error_code & PF_USER))
		no_context(regs, error_code, address);

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	tsk->thread.cr2		= address;
	tsk->thread.error_code	= error_code;
	tsk->thread.trap_no	= 14;

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & VM_FAULT_HWPOISON) {
		printk(KERN_ERR
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
		code = BUS_MCEERR_AR;
	}
#endif
	force_sig_info_fault(SIGBUS, code, address, tsk);
}

static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
	       unsigned long address, unsigned int fault)
{
	if (fault & VM_FAULT_OOM) {
		out_of_memory(regs, error_code, address);
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
			do_sigbus(regs, error_code, address, fault);
		else
			BUG();
	}
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret;

	/* Reserved-bit violation or user access to kernel space?
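	 * A reserved-bit fault means the page tables are corrupt, and a
	 * user access to kernel space is never made valid by the lazy
	 * permission upgrades handled here, so treat both as real faults.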
	 */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	ret = spurious_fault_check(error_code, pte);
	if (!ret)
		return 0;

	/*
	 * Make sure we have permissions in PMD.
	 * If not, then there's a bug in the page tables:
	 */
	ret = spurious_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

	return ret;
}

int show_unhandled_signals = 1;

static inline int
access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
{
	if (write) {
		/* write, present and write, not present: */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	/* read, present: */
	if (unlikely(error_code & PF_PROT))
		return 1;

	/* read, not present: */
	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
		return 1;

	return 0;
}

static int fault_in_kernel_space(unsigned long address)
{
	return address >= TASK_SIZE_MAX;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	unsigned long address;
	struct mm_struct *mm;
	int write;
	int fault;

	tsk = current;
	mm = tsk->mm;

	/* Get the faulting address: */
	address = read_cr2();

	/*
	 * Detect and handle instructions that would cause a page fault for
	 * both a tracked kernel page and a userspace page.
	 */
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);
	prefetchw(&mm->mmap_sem);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(fault_in_kernel_space(address))) {
		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
			if (vmalloc_fault(address) >= 0)
				return;

			if (kmemcheck_fault(regs, address, error_code))
				return;
		}

		/* Can handle a stale RO->RW TLB: */
		if (spurious_fault(error_code, address))
			return;

		/* kprobes don't want to hook the spurious faults: */
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here.
		 * If we fixup a prefetch fault we could otherwise deadlock:
		 */
		bad_area_nosemaphore(regs, error_code, address);

		return;
	}

	/* kprobes don't want to hook the spurious faults: */
	if (unlikely(notify_page_fault(regs)))
		return;
	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode_vm(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in an atomic region then we must not take the fault:
	 */
	if (unlikely(in_atomic() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in
	 * the kernel and should generate an OOPS.  Unfortunately, in the
	 * case of an erroneous fault occurring in a code path which already
	 * holds mmap_sem we will deadlock attempting to validate the fault
	 * against the address space.  Luckily the kernel only validly
	 * references user space from well defined areas of code, which are
	 * listed in the exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a
	 * deadlock. Attempt to lock the address space, if we cannot we then
	 * validate the source. If this is invalid we can skip the address
	 * space check, thus avoiding the deadlock:
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535, $31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	write = error_code & PF_WRITE;

	if (unlikely(access_error(error_code, write, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault:
	 */
	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);

	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, error_code, address, fault);
		return;
	}

	if (fault & VM_FAULT_MAJOR) {
		tsk->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
				     regs, address);
	} else {
		tsk->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
				     regs, address);
	}

	check_v8086_mode(regs, address, tsk);

	up_read(&mm->mmap_sem);
}
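
/*
 * Illustrative examples of hardware error-code combinations, decoded
 * with the PF_* bits defined at the top of this file:
 *
 *   0x0                             kernel read of a not-present page
 *                                   (e.g. a vmalloc fault)
 *   0x4 (PF_USER)                   user read of a not-present page
 *   0x6 (PF_USER|PF_WRITE)          user write to a not-present page
 *   0x7 (PF_USER|PF_WRITE|PF_PROT)  user write blocked by page protections
 *                                   (e.g. a copy-on-write fault)
 */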