/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>		/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;
#endif

	/* If it was an exec fault on an NX page, ignore */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued.
			 * Could check the LDT for lm, but for now it's good
			 * enough to assume that long mode only uses well
			 * known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
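	 * (With CONFIG_HIGHPTE the pte page may live in highmem, and the
	 *  atomic kmap slots could already be in use by whatever code
	 *  faulted, hence the max_low_pfn check below before touching it.)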
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Workaround for K8 erratum #93 and buggy BIOS.
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 * A lot of BIOSes that didn't get tested properly miss this.
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 * Note we only handle faults in kernel here.
 * Does nothing for X86_32.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB.
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/*
	 * Reserved-bit violation or user access to kernel space?
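	 * Neither can be the result of a lazy permission upgrade, so
	 * these are never the spurious faults we handle here.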
	 */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the latter
	 * case just flush.
	 */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared.
	 */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that.
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
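	 * Bring the irq-flags tracing (lockdep) state back in sync with
	 * the real hardware state before doing anything else.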
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	/*
	 * It's safe to allow irqs after cr2 has been saved and the vmalloc
	 * fault has been handled.
	 */
	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
#endif
	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

#ifdef CONFIG_X86_32
survive:
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
				task_pid_nr(tsk) > 1 ?
					KERN_INFO : KERN_EMERG,
				tsk->comm, task_pid_nr(tsk), address, regs->ip,
				regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
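	 * insync remembers which pgd slots have already been propagated
	 * to every pagetable on pgd_list; start just caches the lowest
	 * address that may still need syncing.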
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				if (!vmalloc_sync_one(page_address(page),
						      address))
					break;
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
#endif
}