/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>		/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
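
/*
 * Illustrative examples of how the hardware combines these bits: a
 * user-mode write to a present but read-only page arrives with
 * error_code == (PF_PROT|PF_WRITE|PF_USER) == 0x7, while a kernel-mode
 * read of a not-present page arrives with error_code == 0.
 */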

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
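/*
 * Copy the kernel pmd entry covering 'address' from the reference page
 * table (init_mm.pgd) into the process page directory passed in.
 * Returns the kernel pmd on success, or NULL when init_mm itself has
 * no mapping for the address yet.
 */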
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB. We catch this in the page fault handler because
 * these addresses are not reachable. Just detect this case and return.
 * Any code segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif
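
/*
 * Returns non-zero when the entry that is installed now already allows
 * the access described by error_code, i.e. the fault can only have come
 * from a stale TLB entry.
 */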
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}
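
/*
 * When non-zero, unresolved user-space faults are reported through the
 * rate-limited printk in do_page_fault() below.
 */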
int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}


#ifdef CONFIG_X86_32
	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
#endif
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

#ifdef CONFIG_X86_32
survive:
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
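
/*
 * Bring the kernel mappings for the vmalloc area in every page
 * directory on pgd_list up to date with the reference page table,
 * init_mm.pgd.
 */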
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				if (!vmalloc_sync_one(page_address(page),
						      address))
					break;
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			unsigned long flags;
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
#endif
}