/*
 * Copyright (C) 1995  Linus Torvalds
 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/magic.h>	/* STACK_END_MAGIC */
#include <linux/sched.h>	/* test_thread_flag(), ... */
#include <linux/kdebug.h>	/* oops_begin/end, ... */
#include <linux/module.h>	/* search_exception_table */
#include <linux/bootmem.h>	/* max_low_pfn */
#include <linux/kprobes.h>	/* __kprobes, ... */
#include <linux/mmiotrace.h>	/* kmmio_handler, ... */
#include <linux/perf_event.h>	/* perf_sw_event */

#include <asm/traps.h>		/* dotraplinkage, ... */
#include <asm/pgalloc.h>	/* pgd_*(), ... */
#include <asm/kmemcheck.h>	/* kmemcheck_*(), ... */

/*
 * Page fault error code bits:
 *
 *   bit 0 ==  0: no page found        1: protection fault
 *   bit 1 ==  0: read access          1: write access
 *   bit 2 ==  0: kernel-mode access   1: user-mode access
 *   bit 3 ==                          1: use of reserved bit detected
 *   bit 4 ==                          1: fault was an instruction fetch
 */
enum x86_pf_error_code {

	PF_PROT		= 1 << 0,
	PF_WRITE	= 1 << 1,
	PF_USER		= 1 << 2,
	PF_RSVD		= 1 << 3,
	PF_INSTR	= 1 << 4,
};
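/*
 * Illustrative decoding of the bits above: a user-mode write that hits
 * a present page whose protections forbid the access arrives with
 * error_code == 0x7 (PF_PROT | PF_WRITE | PF_USER), while a user-mode
 * read of an unmapped address arrives with error_code == 0x4 (PF_USER
 * only, since the page was not found).
 */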
/*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
 */
static inline int __kprobes
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
	return 0;
}

static inline int __kprobes notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (kprobes_built_in() && !user_mode_vm(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present, so
		 * X86_64 will never get here anyway.
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
		 * Need to figure out under what instruction mode the
		 * instruction was issued. Could check the LDT for lm,
		 * but for now it's good enough to assume that long
		 * mode only uses well known segments or kernel.
		 */
		return (!user_mode(regs)) || (regs->cs == __USER_CS);
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}

static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
	unsigned char *max_instr;
	unsigned char *instr;
	int prefetch = 0;

	/*
	 * If it was an exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (void *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (instr < max_instr) {
		unsigned char opcode;

		if (probe_kernel_address(instr, opcode))
			break;

		instr++;

		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
			break;
	}
	return prefetch;
}
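/*
 * Example of the quirk handled above (illustrative): on an affected
 * Athlon, a "prefetchnta" (opcode 0F 18) issued through a stale pointer
 * can raise a spurious #PF even though prefetches are not supposed to
 * fault.  is_prefetch() decodes up to 15 bytes at the faulting
 * instruction pointer, skips any prefixes, and returns 1 when it finds
 * a 0F 0D or 0F 18 prefetch opcode, so such a fault is silently dropped.
 */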
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
		     struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;

	force_sig_info(si_signo, &info, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */
	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;

	if (!pmd_present(*pmd))
		set_pmd(pmd, *pmd_k);
	else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));

	return pmd_k;
}

void vmalloc_sync_all(void)
{
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE && address < FIXADDR_TOP;
	     address += PMD_SIZE) {

		unsigned long flags;
		struct page *page;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			if (!vmalloc_sync_one(page_address(page), address))
				break;
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
}

/*
 * 32-bit:
 *
 *   Handle a fault on the vmalloc or module mapping area
 */
static noinline __kprobes int vmalloc_fault(unsigned long address)
{
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;

	return 0;
}

/*
 * Did it hit the DOS screen memory VA from vm86 mode?
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
	unsigned long bit;

	if (!v8086_mode(regs))
		return;

	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		tsk->thread.screen_bitmap |= 1 << bit;
}

static bool low_pfn(unsigned long pfn)
{
	return pfn < max_low_pfn;
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3());
	pgd_t *pgd = &base[pgd_index(address)];
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
#endif
	pmd = pmd_offset(pud_offset(pgd, address), address);
	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already:
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	printk("\n");
}

#else /* CONFIG_X86_64: */

void vmalloc_sync_all(void)
{
	unsigned long address;

	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
	     address += PGDIR_SIZE) {

		const pgd_t *pgd_ref = pgd_offset_k(address);
		unsigned long flags;
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
}

/*
 * 64-bit:
 *
 *   Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline __kprobes int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area: */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the latter
	 * case just flush:
	 */
	pgd = pgd_offset(current->active_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;

	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared:
	 */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;

	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();

	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;

	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();

	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;

	pte = pte_offset_kernel(pmd, address);

	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that:
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();

	return 0;
}

static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";

/*
 * No vm86 mode in 64-bit mode:
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}

static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd_t *pgd = base + pgd_index(address);
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (bad_address(pgd))
		goto bad;

	printk("PGD %lx ", pgd_val(*pgd));

	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (bad_address(pud))
		goto bad;

	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;

	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;

	printk("PTE %lx", pte_val(*pte));
out:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if (address != regs->ip)
		return 0;

	if ((address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		printk_once(errata93_warning);
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}
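/*
 * Illustrative example of the fix-up above (addresses made up): if the
 * erratum clears the upper half of a kernel RIP such as
 * 0xffffffff8100b1f0, the fault is reported at 0x000000008100b1f0.
 * OR-ing the upper 32 bits back in yields an address inside the kernel
 * text (or module area), so the handler restores regs->ip to that
 * value and resumes instead of oopsing.
 */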
/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
		return 1;
#endif
	return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;

	/*
	 * Pentium F0 0F C7 C8 bug workaround:
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";

static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	if (!oops_may_print())
		return;

	if (error_code & PF_INSTR) {
		unsigned int level;

		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(nx_warning, current_uid());
	}

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");

	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);

	dump_pagetable(address);
}

static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
	    unsigned long address)
{
	struct task_struct *tsk;
	unsigned long flags;
	int sig;

	flags = oops_begin();
	tsk = current;
	sig = SIGKILL;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

	if (__die("Bad pagetable", regs, error_code))
		sig = 0;

	oops_end(flags, regs, sig);
}

static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address)
{
	struct task_struct *tsk = current;
	unsigned long *stackend;
	unsigned long flags;
	int sig;

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * 32-bit:
	 *
	 *   Valid to do another page fault here, because if this fault
	 *   had been triggered by is_prefetch fixup_exception would have
	 *   handled it.
	 *
	 * 64-bit:
	 *
	 *   Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	stackend = end_of_stack(tsk);
	if (tsk != &init_task && *stackend != STACK_END_MAGIC)
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
		unsigned long address, struct task_struct *tsk)
{
	if (!unhandled_signal(tsk, SIGSEGV))
		return;

	if (!printk_ratelimit())
		return;

	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
		tsk->comm, task_pid_nr(tsk), address,
		(void *)regs->ip, (void *)regs->sp, error_code);

	print_vma_addr(KERN_CONT " in ", regs->ip);

	printk(KERN_CONT "\n");
}

static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		       unsigned long address, int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here:
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space:
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

		if (unlikely(show_unhandled_signals))
			show_signal_msg(regs, error_code, address, tsk);

		/* Kernel addresses are always protection faults: */
		tsk->thread.cr2 = address;
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);

		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address);
}

static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void
out_of_memory(struct pt_regs *regs, unsigned long error_code,
	      unsigned long address)
{
	/*
	 * We ran out of memory, call the OOM killer, and return to userspace
	 * (which will retry the fault, or kill us if we got oom-killed):
	 */
	up_read(&current->mm->mmap_sem);

	pagefault_out_of_memory();
}

static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
	  unsigned int fault)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int code = BUS_ADRERR;

	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die: */
	if (!(error_code & PF_USER))
		no_context(regs, error_code, address);

	/* User-space => ok to do another page fault: */
	if (is_prefetch(regs, error_code, address))
		return;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;

#ifdef CONFIG_MEMORY_FAILURE
	if (fault & VM_FAULT_HWPOISON) {
		printk(KERN_ERR
	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
			tsk->comm, tsk->pid, address);
		code = BUS_MCEERR_AR;
	}
#endif
	force_sig_info_fault(SIGBUS, code, address, tsk);
}

static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
	       unsigned long address, unsigned int fault)
{
	if (fault & VM_FAULT_OOM) {
		out_of_memory(regs, error_code, address);
	} else {
		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
			do_sigbus(regs, error_code, address, fault);
		else
			BUG();
	}
}

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;

	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X). Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline __kprobes int
spurious_fault(unsigned long error_code, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	ret = spurious_fault_check(error_code, pte);
	if (!ret)
		return 0;

	/*
	 * Make sure we have permissions in PMD.
	 * If not, then there's a bug in the page tables:
	 */
	ret = spurious_fault_check(error_code, (pte_t *) pmd);
	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

	return ret;
}
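/*
 * Example of the situation handled above (illustrative): after a
 * set_memory_rw() call flips a read-only kernel page to read-write,
 * another CPU may still hold the old read-only translation in its TLB.
 * The first write from that CPU traps, spurious_fault() walks the page
 * tables, sees that the current PTE already permits the write, and
 * returns 1; the faulting access is simply retried and picks up a
 * fresh TLB entry, without any cross-CPU flush having been sent.
 */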
int show_unhandled_signals = 1;

static inline int
access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
{
	if (write) {
		/* write, present and write, not present: */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
		return 0;
	}

	/* read, present: */
	if (unlikely(error_code & PF_PROT))
		return 1;

	/* read, not present: */
	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
		return 1;

	return 0;
}

static int fault_in_kernel_space(unsigned long address)
{
	return address >= TASK_SIZE_MAX;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	unsigned long address;
	struct mm_struct *mm;
	int write;
	int fault;

	tsk = current;
	mm = tsk->mm;

	/* Get the faulting address: */
	address = read_cr2();

	/*
	 * Detect and handle instructions that would cause a page fault for
	 * both a tracked kernel page and a userspace page.
	 */
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);
	prefetchw(&mm->mmap_sem);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * ((error_code & PF_USER) == 0), and that the fault was not a
	 * protection or reserved-bit fault
	 * ((error_code & (PF_PROT | PF_RSVD)) == 0).
	 */
	if (unlikely(fault_in_kernel_space(address))) {
		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
			if (vmalloc_fault(address) >= 0)
				return;

			if (kmemcheck_fault(regs, address, error_code))
				return;
		}

		/* Can handle a stale RO->RW TLB: */
		if (spurious_fault(error_code, address))
			return;

		/* kprobes don't want to hook the spurious faults: */
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock:
		 */
		bad_area_nosemaphore(regs, error_code, address);

		return;
	}

	/* kprobes don't want to hook the spurious faults: */
	if (unlikely(notify_page_fault(regs)))
		return;
	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode_vm(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in an atomic region then we must not take the fault:
	 */
	if (unlikely(in_atomic() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in
	 * the kernel and should generate an OOPS. Unfortunately, in the
	 * case of an erroneous fault occurring in a code path which already
	 * holds mmap_sem we will deadlock attempting to validate the fault
	 * against the address space. Luckily the kernel only validly
	 * references user space from well defined areas of code, which are
	 * listed in the exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a
	 * deadlock. Attempt to lock the address space, if we cannot we then
	 * validate the source. If this is invalid we can skip the address
	 * space check, thus avoiding the deadlock:
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535, $31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	write = error_code & PF_WRITE;

	if (unlikely(access_error(error_code, write, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault:
	 */
	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);

	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, error_code, address, fault);
		return;
	}

	if (fault & VM_FAULT_MAJOR) {
		tsk->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
			      regs, address);
	} else {
		tsk->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
			      regs, address);
	}

	check_v8086_mode(regs, address, tsk);

	up_read(&mm->mmap_sem);
}