1c61e211dSHarvey Harrison /* 2c61e211dSHarvey Harrison * Copyright (C) 1995 Linus Torvalds 3c61e211dSHarvey Harrison * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 4c61e211dSHarvey Harrison */ 5c61e211dSHarvey Harrison 6c61e211dSHarvey Harrison #include <linux/signal.h> 7c61e211dSHarvey Harrison #include <linux/sched.h> 8c61e211dSHarvey Harrison #include <linux/kernel.h> 9c61e211dSHarvey Harrison #include <linux/errno.h> 10c61e211dSHarvey Harrison #include <linux/string.h> 11c61e211dSHarvey Harrison #include <linux/types.h> 12c61e211dSHarvey Harrison #include <linux/ptrace.h> 130fd0e3daSPekka Paalanen #include <linux/mmiotrace.h> 14c61e211dSHarvey Harrison #include <linux/mman.h> 15c61e211dSHarvey Harrison #include <linux/mm.h> 16c61e211dSHarvey Harrison #include <linux/smp.h> 17c61e211dSHarvey Harrison #include <linux/interrupt.h> 18c61e211dSHarvey Harrison #include <linux/init.h> 19c61e211dSHarvey Harrison #include <linux/tty.h> 20c61e211dSHarvey Harrison #include <linux/vt_kern.h> /* For unblank_screen() */ 21c61e211dSHarvey Harrison #include <linux/compiler.h> 22c61e211dSHarvey Harrison #include <linux/highmem.h> 23c61e211dSHarvey Harrison #include <linux/bootmem.h> /* for max_low_pfn */ 24c61e211dSHarvey Harrison #include <linux/vmalloc.h> 25c61e211dSHarvey Harrison #include <linux/module.h> 26c61e211dSHarvey Harrison #include <linux/kprobes.h> 27c61e211dSHarvey Harrison #include <linux/uaccess.h> 28c61e211dSHarvey Harrison #include <linux/kdebug.h> 29c61e211dSHarvey Harrison 30c61e211dSHarvey Harrison #include <asm/system.h> 31c61e211dSHarvey Harrison #include <asm/desc.h> 32c61e211dSHarvey Harrison #include <asm/segment.h> 33c61e211dSHarvey Harrison #include <asm/pgalloc.h> 34c61e211dSHarvey Harrison #include <asm/smp.h> 35c61e211dSHarvey Harrison #include <asm/tlbflush.h> 36c61e211dSHarvey Harrison #include <asm/proto.h> 37c61e211dSHarvey Harrison #include <asm-generic/sections.h> 3870ef5641SJaswinder Singh #include <asm/traps.h> 39c61e211dSHarvey Harrison 40c61e211dSHarvey Harrison /* 41c61e211dSHarvey Harrison * Page fault error code bits 42c61e211dSHarvey Harrison * bit 0 == 0 means no page found, 1 means protection fault 43c61e211dSHarvey Harrison * bit 1 == 0 means read, 1 means write 44c61e211dSHarvey Harrison * bit 2 == 0 means kernel, 1 means user-mode 45c61e211dSHarvey Harrison * bit 3 == 1 means use of reserved bit detected 46c61e211dSHarvey Harrison * bit 4 == 1 means fault was an instruction fetch 47c61e211dSHarvey Harrison */ 48c61e211dSHarvey Harrison #define PF_PROT (1<<0) 49c61e211dSHarvey Harrison #define PF_WRITE (1<<1) 50c61e211dSHarvey Harrison #define PF_USER (1<<2) 51c61e211dSHarvey Harrison #define PF_RSVD (1<<3) 52c61e211dSHarvey Harrison #define PF_INSTR (1<<4) 53c61e211dSHarvey Harrison 540fd0e3daSPekka Paalanen static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 5586069782SPekka Paalanen { 5610c43d2eSPekka Paalanen #ifdef CONFIG_MMIOTRACE_HOOKS 570fd0e3daSPekka Paalanen if (unlikely(is_kmmio_active())) 580fd0e3daSPekka Paalanen if (kmmio_handler(regs, addr) == 1) 590fd0e3daSPekka Paalanen return -1; 6086069782SPekka Paalanen #endif 610fd0e3daSPekka Paalanen return 0; 6286069782SPekka Paalanen } 6386069782SPekka Paalanen 64c61e211dSHarvey Harrison static inline int notify_page_fault(struct pt_regs *regs) 65c61e211dSHarvey Harrison { 66c61e211dSHarvey Harrison #ifdef CONFIG_KPROBES 67c61e211dSHarvey Harrison int ret = 0; 68c61e211dSHarvey Harrison 69c61e211dSHarvey Harrison /* kprobe_running() needs smp_processor_id() */ 70c61e211dSHarvey Harrison if (!user_mode_vm(regs)) { 71c61e211dSHarvey Harrison preempt_disable(); 72c61e211dSHarvey Harrison if (kprobe_running() && kprobe_fault_handler(regs, 14)) 73c61e211dSHarvey Harrison ret = 1; 74c61e211dSHarvey Harrison preempt_enable(); 75c61e211dSHarvey Harrison } 76c61e211dSHarvey Harrison 77c61e211dSHarvey Harrison return ret; 78c61e211dSHarvey Harrison #else 79c61e211dSHarvey Harrison return 0; 80c61e211dSHarvey Harrison #endif 81c61e211dSHarvey Harrison } 82c61e211dSHarvey Harrison 83c61e211dSHarvey Harrison /* 84c61e211dSHarvey Harrison * X86_32 85c61e211dSHarvey Harrison * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. 86c61e211dSHarvey Harrison * Check that here and ignore it. 87c61e211dSHarvey Harrison * 88c61e211dSHarvey Harrison * X86_64 89c61e211dSHarvey Harrison * Sometimes the CPU reports invalid exceptions on prefetch. 90c61e211dSHarvey Harrison * Check that here and ignore it. 91c61e211dSHarvey Harrison * 92c61e211dSHarvey Harrison * Opcode checker based on code by Richard Brunner 93c61e211dSHarvey Harrison */ 94c61e211dSHarvey Harrison static int is_prefetch(struct pt_regs *regs, unsigned long addr, 95c61e211dSHarvey Harrison unsigned long error_code) 96c61e211dSHarvey Harrison { 97c61e211dSHarvey Harrison unsigned char *instr; 98c61e211dSHarvey Harrison int scan_more = 1; 99c61e211dSHarvey Harrison int prefetch = 0; 100c61e211dSHarvey Harrison unsigned char *max_instr; 101c61e211dSHarvey Harrison 1023085354dSIngo Molnar /* 1033085354dSIngo Molnar * If it was a exec (instruction fetch) fault on NX page, then 1043085354dSIngo Molnar * do not ignore the fault: 1053085354dSIngo Molnar */ 106c61e211dSHarvey Harrison if (error_code & PF_INSTR) 107c61e211dSHarvey Harrison return 0; 108c61e211dSHarvey Harrison 109c61e211dSHarvey Harrison instr = (unsigned char *)convert_ip_to_linear(current, regs); 110c61e211dSHarvey Harrison max_instr = instr + 15; 111c61e211dSHarvey Harrison 112c61e211dSHarvey Harrison if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) 113c61e211dSHarvey Harrison return 0; 114c61e211dSHarvey Harrison 115c61e211dSHarvey Harrison while (scan_more && instr < max_instr) { 116c61e211dSHarvey Harrison unsigned char opcode; 117c61e211dSHarvey Harrison unsigned char instr_hi; 118c61e211dSHarvey Harrison unsigned char instr_lo; 119c61e211dSHarvey Harrison 120c61e211dSHarvey Harrison if (probe_kernel_address(instr, opcode)) 121c61e211dSHarvey Harrison break; 122c61e211dSHarvey Harrison 123c61e211dSHarvey Harrison instr_hi = opcode & 0xf0; 124c61e211dSHarvey Harrison instr_lo = opcode & 0x0f; 125c61e211dSHarvey Harrison instr++; 126c61e211dSHarvey Harrison 127c61e211dSHarvey Harrison switch (instr_hi) { 128c61e211dSHarvey Harrison case 0x20: 129c61e211dSHarvey Harrison case 0x30: 130c61e211dSHarvey Harrison /* 131c61e211dSHarvey Harrison * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. 132c61e211dSHarvey Harrison * In X86_64 long mode, the CPU will signal invalid 133c61e211dSHarvey Harrison * opcode if some of these prefixes are present so 134c61e211dSHarvey Harrison * X86_64 will never get here anyway 135c61e211dSHarvey Harrison */ 136c61e211dSHarvey Harrison scan_more = ((instr_lo & 7) == 0x6); 137c61e211dSHarvey Harrison break; 138c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 139c61e211dSHarvey Harrison case 0x40: 140c61e211dSHarvey Harrison /* 141c61e211dSHarvey Harrison * In AMD64 long mode 0x40..0x4F are valid REX prefixes 142c61e211dSHarvey Harrison * Need to figure out under what instruction mode the 143c61e211dSHarvey Harrison * instruction was issued. Could check the LDT for lm, 144c61e211dSHarvey Harrison * but for now it's good enough to assume that long 145c61e211dSHarvey Harrison * mode only uses well known segments or kernel. 146c61e211dSHarvey Harrison */ 147c61e211dSHarvey Harrison scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); 148c61e211dSHarvey Harrison break; 149c61e211dSHarvey Harrison #endif 150c61e211dSHarvey Harrison case 0x60: 151c61e211dSHarvey Harrison /* 0x64 thru 0x67 are valid prefixes in all modes. */ 152c61e211dSHarvey Harrison scan_more = (instr_lo & 0xC) == 0x4; 153c61e211dSHarvey Harrison break; 154c61e211dSHarvey Harrison case 0xF0: 155c61e211dSHarvey Harrison /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ 156c61e211dSHarvey Harrison scan_more = !instr_lo || (instr_lo>>1) == 1; 157c61e211dSHarvey Harrison break; 158c61e211dSHarvey Harrison case 0x00: 159c61e211dSHarvey Harrison /* Prefetch instruction is 0x0F0D or 0x0F18 */ 160c61e211dSHarvey Harrison scan_more = 0; 161c61e211dSHarvey Harrison 162c61e211dSHarvey Harrison if (probe_kernel_address(instr, opcode)) 163c61e211dSHarvey Harrison break; 164c61e211dSHarvey Harrison prefetch = (instr_lo == 0xF) && 165c61e211dSHarvey Harrison (opcode == 0x0D || opcode == 0x18); 166c61e211dSHarvey Harrison break; 167c61e211dSHarvey Harrison default: 168c61e211dSHarvey Harrison scan_more = 0; 169c61e211dSHarvey Harrison break; 170c61e211dSHarvey Harrison } 171c61e211dSHarvey Harrison } 172c61e211dSHarvey Harrison return prefetch; 173c61e211dSHarvey Harrison } 174c61e211dSHarvey Harrison 175c61e211dSHarvey Harrison static void force_sig_info_fault(int si_signo, int si_code, 176c61e211dSHarvey Harrison unsigned long address, struct task_struct *tsk) 177c61e211dSHarvey Harrison { 178c61e211dSHarvey Harrison siginfo_t info; 179c61e211dSHarvey Harrison 180c61e211dSHarvey Harrison info.si_signo = si_signo; 181c61e211dSHarvey Harrison info.si_errno = 0; 182c61e211dSHarvey Harrison info.si_code = si_code; 183c61e211dSHarvey Harrison info.si_addr = (void __user *)address; 184c61e211dSHarvey Harrison force_sig_info(si_signo, &info, tsk); 185c61e211dSHarvey Harrison } 186c61e211dSHarvey Harrison 187c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 188c61e211dSHarvey Harrison static int bad_address(void *p) 189c61e211dSHarvey Harrison { 190c61e211dSHarvey Harrison unsigned long dummy; 191c61e211dSHarvey Harrison return probe_kernel_address((unsigned long *)p, dummy); 192c61e211dSHarvey Harrison } 193c61e211dSHarvey Harrison #endif 194c61e211dSHarvey Harrison 195cae30f82SAdrian Bunk static void dump_pagetable(unsigned long address) 196c61e211dSHarvey Harrison { 197c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 198c61e211dSHarvey Harrison __typeof__(pte_val(__pte(0))) page; 199c61e211dSHarvey Harrison 200c61e211dSHarvey Harrison page = read_cr3(); 201c61e211dSHarvey Harrison page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; 202c61e211dSHarvey Harrison #ifdef CONFIG_X86_PAE 203c61e211dSHarvey Harrison printk("*pdpt = %016Lx ", page); 204c61e211dSHarvey Harrison if ((page >> PAGE_SHIFT) < max_low_pfn 205c61e211dSHarvey Harrison && page & _PAGE_PRESENT) { 206c61e211dSHarvey Harrison page &= PAGE_MASK; 207c61e211dSHarvey Harrison page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) 208c61e211dSHarvey Harrison & (PTRS_PER_PMD - 1)]; 209c61e211dSHarvey Harrison printk(KERN_CONT "*pde = %016Lx ", page); 210c61e211dSHarvey Harrison page &= ~_PAGE_NX; 211c61e211dSHarvey Harrison } 212c61e211dSHarvey Harrison #else 213c61e211dSHarvey Harrison printk("*pde = %08lx ", page); 214c61e211dSHarvey Harrison #endif 215c61e211dSHarvey Harrison 216c61e211dSHarvey Harrison /* 217c61e211dSHarvey Harrison * We must not directly access the pte in the highpte 218c61e211dSHarvey Harrison * case if the page table is located in highmem. 219c61e211dSHarvey Harrison * And let's rather not kmap-atomic the pte, just in case 220c61e211dSHarvey Harrison * it's allocated already. 221c61e211dSHarvey Harrison */ 222c61e211dSHarvey Harrison if ((page >> PAGE_SHIFT) < max_low_pfn 223c61e211dSHarvey Harrison && (page & _PAGE_PRESENT) 224c61e211dSHarvey Harrison && !(page & _PAGE_PSE)) { 225c61e211dSHarvey Harrison page &= PAGE_MASK; 226c61e211dSHarvey Harrison page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) 227c61e211dSHarvey Harrison & (PTRS_PER_PTE - 1)]; 228c61e211dSHarvey Harrison printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); 229c61e211dSHarvey Harrison } 230c61e211dSHarvey Harrison 231c61e211dSHarvey Harrison printk("\n"); 232c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */ 233c61e211dSHarvey Harrison pgd_t *pgd; 234c61e211dSHarvey Harrison pud_t *pud; 235c61e211dSHarvey Harrison pmd_t *pmd; 236c61e211dSHarvey Harrison pte_t *pte; 237c61e211dSHarvey Harrison 238c61e211dSHarvey Harrison pgd = (pgd_t *)read_cr3(); 239c61e211dSHarvey Harrison 240c61e211dSHarvey Harrison pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 241c61e211dSHarvey Harrison pgd += pgd_index(address); 242c61e211dSHarvey Harrison if (bad_address(pgd)) goto bad; 243c61e211dSHarvey Harrison printk("PGD %lx ", pgd_val(*pgd)); 244c61e211dSHarvey Harrison if (!pgd_present(*pgd)) goto ret; 245c61e211dSHarvey Harrison 246c61e211dSHarvey Harrison pud = pud_offset(pgd, address); 247c61e211dSHarvey Harrison if (bad_address(pud)) goto bad; 248c61e211dSHarvey Harrison printk("PUD %lx ", pud_val(*pud)); 249b5360222SAndi Kleen if (!pud_present(*pud) || pud_large(*pud)) 250b5360222SAndi Kleen goto ret; 251c61e211dSHarvey Harrison 252c61e211dSHarvey Harrison pmd = pmd_offset(pud, address); 253c61e211dSHarvey Harrison if (bad_address(pmd)) goto bad; 254c61e211dSHarvey Harrison printk("PMD %lx ", pmd_val(*pmd)); 255c61e211dSHarvey Harrison if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; 256c61e211dSHarvey Harrison 257c61e211dSHarvey Harrison pte = pte_offset_kernel(pmd, address); 258c61e211dSHarvey Harrison if (bad_address(pte)) goto bad; 259c61e211dSHarvey Harrison printk("PTE %lx", pte_val(*pte)); 260c61e211dSHarvey Harrison ret: 261c61e211dSHarvey Harrison printk("\n"); 262c61e211dSHarvey Harrison return; 263c61e211dSHarvey Harrison bad: 264c61e211dSHarvey Harrison printk("BAD\n"); 265c61e211dSHarvey Harrison #endif 266c61e211dSHarvey Harrison } 267c61e211dSHarvey Harrison 268c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 269c61e211dSHarvey Harrison static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) 270c61e211dSHarvey Harrison { 271c61e211dSHarvey Harrison unsigned index = pgd_index(address); 272c61e211dSHarvey Harrison pgd_t *pgd_k; 273c61e211dSHarvey Harrison pud_t *pud, *pud_k; 274c61e211dSHarvey Harrison pmd_t *pmd, *pmd_k; 275c61e211dSHarvey Harrison 276c61e211dSHarvey Harrison pgd += index; 277c61e211dSHarvey Harrison pgd_k = init_mm.pgd + index; 278c61e211dSHarvey Harrison 279c61e211dSHarvey Harrison if (!pgd_present(*pgd_k)) 280c61e211dSHarvey Harrison return NULL; 281c61e211dSHarvey Harrison 282c61e211dSHarvey Harrison /* 283c61e211dSHarvey Harrison * set_pgd(pgd, *pgd_k); here would be useless on PAE 284c61e211dSHarvey Harrison * and redundant with the set_pmd() on non-PAE. As would 285c61e211dSHarvey Harrison * set_pud. 286c61e211dSHarvey Harrison */ 287c61e211dSHarvey Harrison 288c61e211dSHarvey Harrison pud = pud_offset(pgd, address); 289c61e211dSHarvey Harrison pud_k = pud_offset(pgd_k, address); 290c61e211dSHarvey Harrison if (!pud_present(*pud_k)) 291c61e211dSHarvey Harrison return NULL; 292c61e211dSHarvey Harrison 293c61e211dSHarvey Harrison pmd = pmd_offset(pud, address); 294c61e211dSHarvey Harrison pmd_k = pmd_offset(pud_k, address); 295c61e211dSHarvey Harrison if (!pmd_present(*pmd_k)) 296c61e211dSHarvey Harrison return NULL; 297c61e211dSHarvey Harrison if (!pmd_present(*pmd)) { 298c61e211dSHarvey Harrison set_pmd(pmd, *pmd_k); 299c61e211dSHarvey Harrison arch_flush_lazy_mmu_mode(); 300c61e211dSHarvey Harrison } else 301c61e211dSHarvey Harrison BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); 302c61e211dSHarvey Harrison return pmd_k; 303c61e211dSHarvey Harrison } 304c61e211dSHarvey Harrison #endif 305c61e211dSHarvey Harrison 306c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 307c61e211dSHarvey Harrison static const char errata93_warning[] = 308c61e211dSHarvey Harrison KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" 309c61e211dSHarvey Harrison KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" 310c61e211dSHarvey Harrison KERN_ERR "******* Please consider a BIOS update.\n" 311c61e211dSHarvey Harrison KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; 312c61e211dSHarvey Harrison #endif 313c61e211dSHarvey Harrison 314c61e211dSHarvey Harrison /* Workaround for K8 erratum #93 & buggy BIOS. 315c61e211dSHarvey Harrison BIOS SMM functions are required to use a specific workaround 316c61e211dSHarvey Harrison to avoid corruption of the 64bit RIP register on C stepping K8. 317c61e211dSHarvey Harrison A lot of BIOS that didn't get tested properly miss this. 318c61e211dSHarvey Harrison The OS sees this as a page fault with the upper 32bits of RIP cleared. 319c61e211dSHarvey Harrison Try to work around it here. 320c61e211dSHarvey Harrison Note we only handle faults in kernel here. 321c61e211dSHarvey Harrison Does nothing for X86_32 322c61e211dSHarvey Harrison */ 323c61e211dSHarvey Harrison static int is_errata93(struct pt_regs *regs, unsigned long address) 324c61e211dSHarvey Harrison { 325c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 326c61e211dSHarvey Harrison static int warned; 327c61e211dSHarvey Harrison if (address != regs->ip) 328c61e211dSHarvey Harrison return 0; 329c61e211dSHarvey Harrison if ((address >> 32) != 0) 330c61e211dSHarvey Harrison return 0; 331c61e211dSHarvey Harrison address |= 0xffffffffUL << 32; 332c61e211dSHarvey Harrison if ((address >= (u64)_stext && address <= (u64)_etext) || 333c61e211dSHarvey Harrison (address >= MODULES_VADDR && address <= MODULES_END)) { 334c61e211dSHarvey Harrison if (!warned) { 335c61e211dSHarvey Harrison printk(errata93_warning); 336c61e211dSHarvey Harrison warned = 1; 337c61e211dSHarvey Harrison } 338c61e211dSHarvey Harrison regs->ip = address; 339c61e211dSHarvey Harrison return 1; 340c61e211dSHarvey Harrison } 341c61e211dSHarvey Harrison #endif 342c61e211dSHarvey Harrison return 0; 343c61e211dSHarvey Harrison } 344c61e211dSHarvey Harrison 345c61e211dSHarvey Harrison /* 346c61e211dSHarvey Harrison * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal 347c61e211dSHarvey Harrison * addresses >4GB. We catch this in the page fault handler because these 348c61e211dSHarvey Harrison * addresses are not reachable. Just detect this case and return. Any code 349c61e211dSHarvey Harrison * segment in LDT is compatibility mode. 350c61e211dSHarvey Harrison */ 351c61e211dSHarvey Harrison static int is_errata100(struct pt_regs *regs, unsigned long address) 352c61e211dSHarvey Harrison { 353c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 354c61e211dSHarvey Harrison if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && 355c61e211dSHarvey Harrison (address >> 32)) 356c61e211dSHarvey Harrison return 1; 357c61e211dSHarvey Harrison #endif 358c61e211dSHarvey Harrison return 0; 359c61e211dSHarvey Harrison } 360c61e211dSHarvey Harrison 361c61e211dSHarvey Harrison static int is_f00f_bug(struct pt_regs *regs, unsigned long address) 362c61e211dSHarvey Harrison { 363c61e211dSHarvey Harrison #ifdef CONFIG_X86_F00F_BUG 364c61e211dSHarvey Harrison unsigned long nr; 365c61e211dSHarvey Harrison /* 366c61e211dSHarvey Harrison * Pentium F0 0F C7 C8 bug workaround. 367c61e211dSHarvey Harrison */ 368c61e211dSHarvey Harrison if (boot_cpu_data.f00f_bug) { 369c61e211dSHarvey Harrison nr = (address - idt_descr.address) >> 3; 370c61e211dSHarvey Harrison 371c61e211dSHarvey Harrison if (nr == 6) { 372c61e211dSHarvey Harrison do_invalid_op(regs, 0); 373c61e211dSHarvey Harrison return 1; 374c61e211dSHarvey Harrison } 375c61e211dSHarvey Harrison } 376c61e211dSHarvey Harrison #endif 377c61e211dSHarvey Harrison return 0; 378c61e211dSHarvey Harrison } 379c61e211dSHarvey Harrison 380c61e211dSHarvey Harrison static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, 381c61e211dSHarvey Harrison unsigned long address) 382c61e211dSHarvey Harrison { 383c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 384c61e211dSHarvey Harrison if (!oops_may_print()) 385c61e211dSHarvey Harrison return; 386fd40d6e3SHarvey Harrison #endif 387c61e211dSHarvey Harrison 388c61e211dSHarvey Harrison #ifdef CONFIG_X86_PAE 389c61e211dSHarvey Harrison if (error_code & PF_INSTR) { 39093809be8SHarvey Harrison unsigned int level; 391c61e211dSHarvey Harrison pte_t *pte = lookup_address(address, &level); 392c61e211dSHarvey Harrison 393c61e211dSHarvey Harrison if (pte && pte_present(*pte) && !pte_exec(*pte)) 394c61e211dSHarvey Harrison printk(KERN_CRIT "kernel tried to execute " 395c61e211dSHarvey Harrison "NX-protected page - exploit attempt? " 396c61e211dSHarvey Harrison "(uid: %d)\n", current->uid); 397c61e211dSHarvey Harrison } 398c61e211dSHarvey Harrison #endif 399fd40d6e3SHarvey Harrison 400c61e211dSHarvey Harrison printk(KERN_ALERT "BUG: unable to handle kernel "); 401c61e211dSHarvey Harrison if (address < PAGE_SIZE) 402c61e211dSHarvey Harrison printk(KERN_CONT "NULL pointer dereference"); 403c61e211dSHarvey Harrison else 404c61e211dSHarvey Harrison printk(KERN_CONT "paging request"); 405f294a8ceSVegard Nossum printk(KERN_CONT " at %p\n", (void *) address); 406c61e211dSHarvey Harrison printk(KERN_ALERT "IP:"); 407c61e211dSHarvey Harrison printk_address(regs->ip, 1); 408c61e211dSHarvey Harrison dump_pagetable(address); 409c61e211dSHarvey Harrison } 410c61e211dSHarvey Harrison 411c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 412c61e211dSHarvey Harrison static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, 413c61e211dSHarvey Harrison unsigned long error_code) 414c61e211dSHarvey Harrison { 415c61e211dSHarvey Harrison unsigned long flags = oops_begin(); 416*874d93d1SAlexander van Heukelum int sig = SIGKILL; 417c61e211dSHarvey Harrison struct task_struct *tsk; 418c61e211dSHarvey Harrison 419c61e211dSHarvey Harrison printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 420c61e211dSHarvey Harrison current->comm, address); 421c61e211dSHarvey Harrison dump_pagetable(address); 422c61e211dSHarvey Harrison tsk = current; 423c61e211dSHarvey Harrison tsk->thread.cr2 = address; 424c61e211dSHarvey Harrison tsk->thread.trap_no = 14; 425c61e211dSHarvey Harrison tsk->thread.error_code = error_code; 426c61e211dSHarvey Harrison if (__die("Bad pagetable", regs, error_code)) 427*874d93d1SAlexander van Heukelum sig = 0; 428*874d93d1SAlexander van Heukelum oops_end(flags, regs, sig); 429c61e211dSHarvey Harrison } 430c61e211dSHarvey Harrison #endif 431c61e211dSHarvey Harrison 432d8b57bb7SThomas Gleixner static int spurious_fault_check(unsigned long error_code, pte_t *pte) 433d8b57bb7SThomas Gleixner { 434d8b57bb7SThomas Gleixner if ((error_code & PF_WRITE) && !pte_write(*pte)) 435d8b57bb7SThomas Gleixner return 0; 436d8b57bb7SThomas Gleixner if ((error_code & PF_INSTR) && !pte_exec(*pte)) 437d8b57bb7SThomas Gleixner return 0; 438d8b57bb7SThomas Gleixner 439d8b57bb7SThomas Gleixner return 1; 440d8b57bb7SThomas Gleixner } 441d8b57bb7SThomas Gleixner 442c61e211dSHarvey Harrison /* 4435b727a3bSJeremy Fitzhardinge * Handle a spurious fault caused by a stale TLB entry. This allows 4445b727a3bSJeremy Fitzhardinge * us to lazily refresh the TLB when increasing the permissions of a 4455b727a3bSJeremy Fitzhardinge * kernel page (RO -> RW or NX -> X). Doing it eagerly is very 4465b727a3bSJeremy Fitzhardinge * expensive since that implies doing a full cross-processor TLB 4475b727a3bSJeremy Fitzhardinge * flush, even if no stale TLB entries exist on other processors. 4485b727a3bSJeremy Fitzhardinge * There are no security implications to leaving a stale TLB when 4495b727a3bSJeremy Fitzhardinge * increasing the permissions on a page. 4505b727a3bSJeremy Fitzhardinge */ 4515b727a3bSJeremy Fitzhardinge static int spurious_fault(unsigned long address, 4525b727a3bSJeremy Fitzhardinge unsigned long error_code) 4535b727a3bSJeremy Fitzhardinge { 4545b727a3bSJeremy Fitzhardinge pgd_t *pgd; 4555b727a3bSJeremy Fitzhardinge pud_t *pud; 4565b727a3bSJeremy Fitzhardinge pmd_t *pmd; 4575b727a3bSJeremy Fitzhardinge pte_t *pte; 4585b727a3bSJeremy Fitzhardinge 4595b727a3bSJeremy Fitzhardinge /* Reserved-bit violation or user access to kernel space? */ 4605b727a3bSJeremy Fitzhardinge if (error_code & (PF_USER | PF_RSVD)) 4615b727a3bSJeremy Fitzhardinge return 0; 4625b727a3bSJeremy Fitzhardinge 4635b727a3bSJeremy Fitzhardinge pgd = init_mm.pgd + pgd_index(address); 4645b727a3bSJeremy Fitzhardinge if (!pgd_present(*pgd)) 4655b727a3bSJeremy Fitzhardinge return 0; 4665b727a3bSJeremy Fitzhardinge 4675b727a3bSJeremy Fitzhardinge pud = pud_offset(pgd, address); 4685b727a3bSJeremy Fitzhardinge if (!pud_present(*pud)) 4695b727a3bSJeremy Fitzhardinge return 0; 4705b727a3bSJeremy Fitzhardinge 471d8b57bb7SThomas Gleixner if (pud_large(*pud)) 472d8b57bb7SThomas Gleixner return spurious_fault_check(error_code, (pte_t *) pud); 473d8b57bb7SThomas Gleixner 4745b727a3bSJeremy Fitzhardinge pmd = pmd_offset(pud, address); 4755b727a3bSJeremy Fitzhardinge if (!pmd_present(*pmd)) 4765b727a3bSJeremy Fitzhardinge return 0; 4775b727a3bSJeremy Fitzhardinge 478d8b57bb7SThomas Gleixner if (pmd_large(*pmd)) 479d8b57bb7SThomas Gleixner return spurious_fault_check(error_code, (pte_t *) pmd); 480d8b57bb7SThomas Gleixner 4815b727a3bSJeremy Fitzhardinge pte = pte_offset_kernel(pmd, address); 4825b727a3bSJeremy Fitzhardinge if (!pte_present(*pte)) 4835b727a3bSJeremy Fitzhardinge return 0; 4845b727a3bSJeremy Fitzhardinge 485d8b57bb7SThomas Gleixner return spurious_fault_check(error_code, pte); 4865b727a3bSJeremy Fitzhardinge } 4875b727a3bSJeremy Fitzhardinge 4885b727a3bSJeremy Fitzhardinge /* 489c61e211dSHarvey Harrison * X86_32 490c61e211dSHarvey Harrison * Handle a fault on the vmalloc or module mapping area 491c61e211dSHarvey Harrison * 492c61e211dSHarvey Harrison * X86_64 493c61e211dSHarvey Harrison * Handle a fault on the vmalloc area 494c61e211dSHarvey Harrison * 495c61e211dSHarvey Harrison * This assumes no large pages in there. 496c61e211dSHarvey Harrison */ 497c61e211dSHarvey Harrison static int vmalloc_fault(unsigned long address) 498c61e211dSHarvey Harrison { 499c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 500c61e211dSHarvey Harrison unsigned long pgd_paddr; 501c61e211dSHarvey Harrison pmd_t *pmd_k; 502c61e211dSHarvey Harrison pte_t *pte_k; 503b29c701dSHenry Nestler 504b29c701dSHenry Nestler /* Make sure we are in vmalloc area */ 505b29c701dSHenry Nestler if (!(address >= VMALLOC_START && address < VMALLOC_END)) 506b29c701dSHenry Nestler return -1; 507b29c701dSHenry Nestler 508c61e211dSHarvey Harrison /* 509c61e211dSHarvey Harrison * Synchronize this task's top level page-table 510c61e211dSHarvey Harrison * with the 'reference' page table. 511c61e211dSHarvey Harrison * 512c61e211dSHarvey Harrison * Do _not_ use "current" here. We might be inside 513c61e211dSHarvey Harrison * an interrupt in the middle of a task switch.. 514c61e211dSHarvey Harrison */ 515c61e211dSHarvey Harrison pgd_paddr = read_cr3(); 516c61e211dSHarvey Harrison pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 517c61e211dSHarvey Harrison if (!pmd_k) 518c61e211dSHarvey Harrison return -1; 519c61e211dSHarvey Harrison pte_k = pte_offset_kernel(pmd_k, address); 520c61e211dSHarvey Harrison if (!pte_present(*pte_k)) 521c61e211dSHarvey Harrison return -1; 522c61e211dSHarvey Harrison return 0; 523c61e211dSHarvey Harrison #else 524c61e211dSHarvey Harrison pgd_t *pgd, *pgd_ref; 525c61e211dSHarvey Harrison pud_t *pud, *pud_ref; 526c61e211dSHarvey Harrison pmd_t *pmd, *pmd_ref; 527c61e211dSHarvey Harrison pte_t *pte, *pte_ref; 528c61e211dSHarvey Harrison 529cf89ec92SHarvey Harrison /* Make sure we are in vmalloc area */ 530cf89ec92SHarvey Harrison if (!(address >= VMALLOC_START && address < VMALLOC_END)) 531cf89ec92SHarvey Harrison return -1; 532cf89ec92SHarvey Harrison 533c61e211dSHarvey Harrison /* Copy kernel mappings over when needed. This can also 534c61e211dSHarvey Harrison happen within a race in page table update. In the later 535c61e211dSHarvey Harrison case just flush. */ 536c61e211dSHarvey Harrison 537c61e211dSHarvey Harrison pgd = pgd_offset(current->mm ?: &init_mm, address); 538c61e211dSHarvey Harrison pgd_ref = pgd_offset_k(address); 539c61e211dSHarvey Harrison if (pgd_none(*pgd_ref)) 540c61e211dSHarvey Harrison return -1; 541c61e211dSHarvey Harrison if (pgd_none(*pgd)) 542c61e211dSHarvey Harrison set_pgd(pgd, *pgd_ref); 543c61e211dSHarvey Harrison else 544c61e211dSHarvey Harrison BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 545c61e211dSHarvey Harrison 546c61e211dSHarvey Harrison /* Below here mismatches are bugs because these lower tables 547c61e211dSHarvey Harrison are shared */ 548c61e211dSHarvey Harrison 549c61e211dSHarvey Harrison pud = pud_offset(pgd, address); 550c61e211dSHarvey Harrison pud_ref = pud_offset(pgd_ref, address); 551c61e211dSHarvey Harrison if (pud_none(*pud_ref)) 552c61e211dSHarvey Harrison return -1; 553c61e211dSHarvey Harrison if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) 554c61e211dSHarvey Harrison BUG(); 555c61e211dSHarvey Harrison pmd = pmd_offset(pud, address); 556c61e211dSHarvey Harrison pmd_ref = pmd_offset(pud_ref, address); 557c61e211dSHarvey Harrison if (pmd_none(*pmd_ref)) 558c61e211dSHarvey Harrison return -1; 559c61e211dSHarvey Harrison if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) 560c61e211dSHarvey Harrison BUG(); 561c61e211dSHarvey Harrison pte_ref = pte_offset_kernel(pmd_ref, address); 562c61e211dSHarvey Harrison if (!pte_present(*pte_ref)) 563c61e211dSHarvey Harrison return -1; 564c61e211dSHarvey Harrison pte = pte_offset_kernel(pmd, address); 565c61e211dSHarvey Harrison /* Don't use pte_page here, because the mappings can point 566c61e211dSHarvey Harrison outside mem_map, and the NUMA hash lookup cannot handle 567c61e211dSHarvey Harrison that. */ 568c61e211dSHarvey Harrison if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) 569c61e211dSHarvey Harrison BUG(); 570c61e211dSHarvey Harrison return 0; 571c61e211dSHarvey Harrison #endif 572c61e211dSHarvey Harrison } 573c61e211dSHarvey Harrison 574c61e211dSHarvey Harrison int show_unhandled_signals = 1; 575c61e211dSHarvey Harrison 576c61e211dSHarvey Harrison /* 577c61e211dSHarvey Harrison * This routine handles page faults. It determines the address, 578c61e211dSHarvey Harrison * and the problem, and then passes it off to one of the appropriate 579c61e211dSHarvey Harrison * routines. 580c61e211dSHarvey Harrison */ 581c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 582c61e211dSHarvey Harrison asmlinkage 583c61e211dSHarvey Harrison #endif 584c61e211dSHarvey Harrison void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) 585c61e211dSHarvey Harrison { 586c61e211dSHarvey Harrison struct task_struct *tsk; 587c61e211dSHarvey Harrison struct mm_struct *mm; 588c61e211dSHarvey Harrison struct vm_area_struct *vma; 589c61e211dSHarvey Harrison unsigned long address; 590c61e211dSHarvey Harrison int write, si_code; 591c61e211dSHarvey Harrison int fault; 592c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 593c61e211dSHarvey Harrison unsigned long flags; 594*874d93d1SAlexander van Heukelum int sig; 595c61e211dSHarvey Harrison #endif 596c61e211dSHarvey Harrison 597c61e211dSHarvey Harrison tsk = current; 598c61e211dSHarvey Harrison mm = tsk->mm; 599c61e211dSHarvey Harrison prefetchw(&mm->mmap_sem); 600c61e211dSHarvey Harrison 601c61e211dSHarvey Harrison /* get the address */ 602c61e211dSHarvey Harrison address = read_cr2(); 603c61e211dSHarvey Harrison 604c61e211dSHarvey Harrison si_code = SEGV_MAPERR; 605c61e211dSHarvey Harrison 606c61e211dSHarvey Harrison if (notify_page_fault(regs)) 607c61e211dSHarvey Harrison return; 6080fd0e3daSPekka Paalanen if (unlikely(kmmio_fault(regs, address))) 60986069782SPekka Paalanen return; 610c61e211dSHarvey Harrison 611c61e211dSHarvey Harrison /* 612c61e211dSHarvey Harrison * We fault-in kernel-space virtual memory on-demand. The 613c61e211dSHarvey Harrison * 'reference' page table is init_mm.pgd. 614c61e211dSHarvey Harrison * 615c61e211dSHarvey Harrison * NOTE! We MUST NOT take any locks for this case. We may 616c61e211dSHarvey Harrison * be in an interrupt or a critical region, and should 617c61e211dSHarvey Harrison * only copy the information from the master page table, 618c61e211dSHarvey Harrison * nothing more. 619c61e211dSHarvey Harrison * 620c61e211dSHarvey Harrison * This verifies that the fault happens in kernel space 621c61e211dSHarvey Harrison * (error_code & 4) == 0, and that the fault was not a 622c61e211dSHarvey Harrison * protection error (error_code & 9) == 0. 623c61e211dSHarvey Harrison */ 624c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 625c61e211dSHarvey Harrison if (unlikely(address >= TASK_SIZE)) { 626cf89ec92SHarvey Harrison #else 627cf89ec92SHarvey Harrison if (unlikely(address >= TASK_SIZE64)) { 628cf89ec92SHarvey Harrison #endif 629c61e211dSHarvey Harrison if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 630c61e211dSHarvey Harrison vmalloc_fault(address) >= 0) 631c61e211dSHarvey Harrison return; 6325b727a3bSJeremy Fitzhardinge 6335b727a3bSJeremy Fitzhardinge /* Can handle a stale RO->RW TLB */ 6345b727a3bSJeremy Fitzhardinge if (spurious_fault(address, error_code)) 6355b727a3bSJeremy Fitzhardinge return; 6365b727a3bSJeremy Fitzhardinge 637c61e211dSHarvey Harrison /* 638c61e211dSHarvey Harrison * Don't take the mm semaphore here. If we fixup a prefetch 639c61e211dSHarvey Harrison * fault we could otherwise deadlock. 640c61e211dSHarvey Harrison */ 641c61e211dSHarvey Harrison goto bad_area_nosemaphore; 642c61e211dSHarvey Harrison } 643c61e211dSHarvey Harrison 644cf89ec92SHarvey Harrison 645c61e211dSHarvey Harrison /* 646891cffbdSLinus Torvalds * It's safe to allow irq's after cr2 has been saved and the 647891cffbdSLinus Torvalds * vmalloc fault has been handled. 648891cffbdSLinus Torvalds * 649891cffbdSLinus Torvalds * User-mode registers count as a user access even for any 650891cffbdSLinus Torvalds * potential system fault or CPU buglet. 651c61e211dSHarvey Harrison */ 652891cffbdSLinus Torvalds if (user_mode_vm(regs)) { 653891cffbdSLinus Torvalds local_irq_enable(); 654891cffbdSLinus Torvalds error_code |= PF_USER; 655891cffbdSLinus Torvalds } else if (regs->flags & X86_EFLAGS_IF) 656c61e211dSHarvey Harrison local_irq_enable(); 657c61e211dSHarvey Harrison 658891cffbdSLinus Torvalds #ifdef CONFIG_X86_64 659c61e211dSHarvey Harrison if (unlikely(error_code & PF_RSVD)) 660c61e211dSHarvey Harrison pgtable_bad(address, regs, error_code); 661891cffbdSLinus Torvalds #endif 662c61e211dSHarvey Harrison 663c61e211dSHarvey Harrison /* 664c61e211dSHarvey Harrison * If we're in an interrupt, have no user context or are running in an 665c61e211dSHarvey Harrison * atomic region then we must not take the fault. 666c61e211dSHarvey Harrison */ 667c61e211dSHarvey Harrison if (unlikely(in_atomic() || !mm)) 668c61e211dSHarvey Harrison goto bad_area_nosemaphore; 669c61e211dSHarvey Harrison 670c61e211dSHarvey Harrison again: 6713a1dfe6eSIngo Molnar /* 6723a1dfe6eSIngo Molnar * When running in the kernel we expect faults to occur only to 673c61e211dSHarvey Harrison * addresses in user space. All other faults represent errors in the 674c61e211dSHarvey Harrison * kernel and should generate an OOPS. Unfortunately, in the case of an 675c61e211dSHarvey Harrison * erroneous fault occurring in a code path which already holds mmap_sem 676c61e211dSHarvey Harrison * we will deadlock attempting to validate the fault against the 677c61e211dSHarvey Harrison * address space. Luckily the kernel only validly references user 678c61e211dSHarvey Harrison * space from well defined areas of code, which are listed in the 679c61e211dSHarvey Harrison * exceptions table. 680c61e211dSHarvey Harrison * 681c61e211dSHarvey Harrison * As the vast majority of faults will be valid we will only perform 682c61e211dSHarvey Harrison * the source reference check when there is a possibility of a deadlock. 683c61e211dSHarvey Harrison * Attempt to lock the address space, if we cannot we then validate the 684c61e211dSHarvey Harrison * source. If this is invalid we can skip the address space check, 685c61e211dSHarvey Harrison * thus avoiding the deadlock. 686c61e211dSHarvey Harrison */ 687c61e211dSHarvey Harrison if (!down_read_trylock(&mm->mmap_sem)) { 688c61e211dSHarvey Harrison if ((error_code & PF_USER) == 0 && 689c61e211dSHarvey Harrison !search_exception_tables(regs->ip)) 690c61e211dSHarvey Harrison goto bad_area_nosemaphore; 691c61e211dSHarvey Harrison down_read(&mm->mmap_sem); 692c61e211dSHarvey Harrison } 693c61e211dSHarvey Harrison 694c61e211dSHarvey Harrison vma = find_vma(mm, address); 695c61e211dSHarvey Harrison if (!vma) 696c61e211dSHarvey Harrison goto bad_area; 697c61e211dSHarvey Harrison if (vma->vm_start <= address) 698c61e211dSHarvey Harrison goto good_area; 699c61e211dSHarvey Harrison if (!(vma->vm_flags & VM_GROWSDOWN)) 700c61e211dSHarvey Harrison goto bad_area; 701c61e211dSHarvey Harrison if (error_code & PF_USER) { 702c61e211dSHarvey Harrison /* 703c61e211dSHarvey Harrison * Accessing the stack below %sp is always a bug. 704c61e211dSHarvey Harrison * The large cushion allows instructions like enter 705c61e211dSHarvey Harrison * and pusha to work. ("enter $65535,$31" pushes 706c61e211dSHarvey Harrison * 32 pointers and then decrements %sp by 65535.) 707c61e211dSHarvey Harrison */ 708c61e211dSHarvey Harrison if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 709c61e211dSHarvey Harrison goto bad_area; 710c61e211dSHarvey Harrison } 711c61e211dSHarvey Harrison if (expand_stack(vma, address)) 712c61e211dSHarvey Harrison goto bad_area; 713c61e211dSHarvey Harrison /* 714c61e211dSHarvey Harrison * Ok, we have a good vm_area for this memory access, so 715c61e211dSHarvey Harrison * we can handle it.. 716c61e211dSHarvey Harrison */ 717c61e211dSHarvey Harrison good_area: 718c61e211dSHarvey Harrison si_code = SEGV_ACCERR; 719c61e211dSHarvey Harrison write = 0; 720c61e211dSHarvey Harrison switch (error_code & (PF_PROT|PF_WRITE)) { 721c61e211dSHarvey Harrison default: /* 3: write, present */ 722c61e211dSHarvey Harrison /* fall through */ 723c61e211dSHarvey Harrison case PF_WRITE: /* write, not present */ 724c61e211dSHarvey Harrison if (!(vma->vm_flags & VM_WRITE)) 725c61e211dSHarvey Harrison goto bad_area; 726c61e211dSHarvey Harrison write++; 727c61e211dSHarvey Harrison break; 728c61e211dSHarvey Harrison case PF_PROT: /* read, present */ 729c61e211dSHarvey Harrison goto bad_area; 730c61e211dSHarvey Harrison case 0: /* read, not present */ 731c61e211dSHarvey Harrison if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) 732c61e211dSHarvey Harrison goto bad_area; 733c61e211dSHarvey Harrison } 734c61e211dSHarvey Harrison 735c61e211dSHarvey Harrison /* 736c61e211dSHarvey Harrison * If for any reason at all we couldn't handle the fault, 737c61e211dSHarvey Harrison * make sure we exit gracefully rather than endlessly redo 738c61e211dSHarvey Harrison * the fault. 739c61e211dSHarvey Harrison */ 740c61e211dSHarvey Harrison fault = handle_mm_fault(mm, vma, address, write); 741c61e211dSHarvey Harrison if (unlikely(fault & VM_FAULT_ERROR)) { 742c61e211dSHarvey Harrison if (fault & VM_FAULT_OOM) 743c61e211dSHarvey Harrison goto out_of_memory; 744c61e211dSHarvey Harrison else if (fault & VM_FAULT_SIGBUS) 745c61e211dSHarvey Harrison goto do_sigbus; 746c61e211dSHarvey Harrison BUG(); 747c61e211dSHarvey Harrison } 748c61e211dSHarvey Harrison if (fault & VM_FAULT_MAJOR) 749c61e211dSHarvey Harrison tsk->maj_flt++; 750c61e211dSHarvey Harrison else 751c61e211dSHarvey Harrison tsk->min_flt++; 752c61e211dSHarvey Harrison 753c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 754c61e211dSHarvey Harrison /* 755c61e211dSHarvey Harrison * Did it hit the DOS screen memory VA from vm86 mode? 756c61e211dSHarvey Harrison */ 757c61e211dSHarvey Harrison if (v8086_mode(regs)) { 758c61e211dSHarvey Harrison unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; 759c61e211dSHarvey Harrison if (bit < 32) 760c61e211dSHarvey Harrison tsk->thread.screen_bitmap |= 1 << bit; 761c61e211dSHarvey Harrison } 762c61e211dSHarvey Harrison #endif 763c61e211dSHarvey Harrison up_read(&mm->mmap_sem); 764c61e211dSHarvey Harrison return; 765c61e211dSHarvey Harrison 766c61e211dSHarvey Harrison /* 767c61e211dSHarvey Harrison * Something tried to access memory that isn't in our memory map.. 768c61e211dSHarvey Harrison * Fix it, but check if it's kernel or user first.. 769c61e211dSHarvey Harrison */ 770c61e211dSHarvey Harrison bad_area: 771c61e211dSHarvey Harrison up_read(&mm->mmap_sem); 772c61e211dSHarvey Harrison 773c61e211dSHarvey Harrison bad_area_nosemaphore: 774c61e211dSHarvey Harrison /* User mode accesses just cause a SIGSEGV */ 775c61e211dSHarvey Harrison if (error_code & PF_USER) { 776c61e211dSHarvey Harrison /* 777c61e211dSHarvey Harrison * It's possible to have interrupts off here. 778c61e211dSHarvey Harrison */ 779c61e211dSHarvey Harrison local_irq_enable(); 780c61e211dSHarvey Harrison 781c61e211dSHarvey Harrison /* 782c61e211dSHarvey Harrison * Valid to do another page fault here because this one came 783c61e211dSHarvey Harrison * from user space. 784c61e211dSHarvey Harrison */ 785c61e211dSHarvey Harrison if (is_prefetch(regs, address, error_code)) 786c61e211dSHarvey Harrison return; 787c61e211dSHarvey Harrison 788c61e211dSHarvey Harrison if (is_errata100(regs, address)) 789c61e211dSHarvey Harrison return; 790c61e211dSHarvey Harrison 791c61e211dSHarvey Harrison if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 792c61e211dSHarvey Harrison printk_ratelimit()) { 793c61e211dSHarvey Harrison printk( 794f294a8ceSVegard Nossum "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", 795c61e211dSHarvey Harrison task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, 796f294a8ceSVegard Nossum tsk->comm, task_pid_nr(tsk), address, 797f294a8ceSVegard Nossum (void *) regs->ip, (void *) regs->sp, error_code); 798c61e211dSHarvey Harrison print_vma_addr(" in ", regs->ip); 799c61e211dSHarvey Harrison printk("\n"); 800c61e211dSHarvey Harrison } 801c61e211dSHarvey Harrison 802c61e211dSHarvey Harrison tsk->thread.cr2 = address; 803c61e211dSHarvey Harrison /* Kernel addresses are always protection faults */ 804c61e211dSHarvey Harrison tsk->thread.error_code = error_code | (address >= TASK_SIZE); 805c61e211dSHarvey Harrison tsk->thread.trap_no = 14; 806c61e211dSHarvey Harrison force_sig_info_fault(SIGSEGV, si_code, address, tsk); 807c61e211dSHarvey Harrison return; 808c61e211dSHarvey Harrison } 809c61e211dSHarvey Harrison 810c61e211dSHarvey Harrison if (is_f00f_bug(regs, address)) 811c61e211dSHarvey Harrison return; 812c61e211dSHarvey Harrison 813c61e211dSHarvey Harrison no_context: 814c61e211dSHarvey Harrison /* Are we prepared to handle this kernel fault? */ 815c61e211dSHarvey Harrison if (fixup_exception(regs)) 816c61e211dSHarvey Harrison return; 817c61e211dSHarvey Harrison 818c61e211dSHarvey Harrison /* 819c61e211dSHarvey Harrison * X86_32 820c61e211dSHarvey Harrison * Valid to do another page fault here, because if this fault 821c61e211dSHarvey Harrison * had been triggered by is_prefetch fixup_exception would have 822c61e211dSHarvey Harrison * handled it. 823c61e211dSHarvey Harrison * 824c61e211dSHarvey Harrison * X86_64 825c61e211dSHarvey Harrison * Hall of shame of CPU/BIOS bugs. 826c61e211dSHarvey Harrison */ 827c61e211dSHarvey Harrison if (is_prefetch(regs, address, error_code)) 828c61e211dSHarvey Harrison return; 829c61e211dSHarvey Harrison 830c61e211dSHarvey Harrison if (is_errata93(regs, address)) 831c61e211dSHarvey Harrison return; 832c61e211dSHarvey Harrison 833c61e211dSHarvey Harrison /* 834c61e211dSHarvey Harrison * Oops. The kernel tried to access some bad page. We'll have to 835c61e211dSHarvey Harrison * terminate things with extreme prejudice. 836c61e211dSHarvey Harrison */ 837c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 838c61e211dSHarvey Harrison bust_spinlocks(1); 839fd40d6e3SHarvey Harrison #else 840fd40d6e3SHarvey Harrison flags = oops_begin(); 841fd40d6e3SHarvey Harrison #endif 842c61e211dSHarvey Harrison 843c61e211dSHarvey Harrison show_fault_oops(regs, error_code, address); 844c61e211dSHarvey Harrison 845c61e211dSHarvey Harrison tsk->thread.cr2 = address; 846c61e211dSHarvey Harrison tsk->thread.trap_no = 14; 847c61e211dSHarvey Harrison tsk->thread.error_code = error_code; 848fd40d6e3SHarvey Harrison 849fd40d6e3SHarvey Harrison #ifdef CONFIG_X86_32 850c61e211dSHarvey Harrison die("Oops", regs, error_code); 851c61e211dSHarvey Harrison bust_spinlocks(0); 852c61e211dSHarvey Harrison do_exit(SIGKILL); 853fd40d6e3SHarvey Harrison #else 854*874d93d1SAlexander van Heukelum sig = SIGKILL; 855c61e211dSHarvey Harrison if (__die("Oops", regs, error_code)) 856*874d93d1SAlexander van Heukelum sig = 0; 857c61e211dSHarvey Harrison /* Executive summary in case the body of the oops scrolled away */ 858c61e211dSHarvey Harrison printk(KERN_EMERG "CR2: %016lx\n", address); 859*874d93d1SAlexander van Heukelum oops_end(flags, regs, sig); 860c61e211dSHarvey Harrison #endif 861c61e211dSHarvey Harrison 862c61e211dSHarvey Harrison /* 863c61e211dSHarvey Harrison * We ran out of memory, or some other thing happened to us that made 864c61e211dSHarvey Harrison * us unable to handle the page fault gracefully. 865c61e211dSHarvey Harrison */ 866c61e211dSHarvey Harrison out_of_memory: 867c61e211dSHarvey Harrison up_read(&mm->mmap_sem); 868c61e211dSHarvey Harrison if (is_global_init(tsk)) { 869c61e211dSHarvey Harrison yield(); 8703a1dfe6eSIngo Molnar /* 8713a1dfe6eSIngo Molnar * Re-lookup the vma - in theory the vma tree might 8723a1dfe6eSIngo Molnar * have changed: 8733a1dfe6eSIngo Molnar */ 874c61e211dSHarvey Harrison goto again; 875fd40d6e3SHarvey Harrison } 876fd40d6e3SHarvey Harrison 877c61e211dSHarvey Harrison printk("VM: killing process %s\n", tsk->comm); 878c61e211dSHarvey Harrison if (error_code & PF_USER) 879c61e211dSHarvey Harrison do_group_exit(SIGKILL); 880c61e211dSHarvey Harrison goto no_context; 881c61e211dSHarvey Harrison 882c61e211dSHarvey Harrison do_sigbus: 883c61e211dSHarvey Harrison up_read(&mm->mmap_sem); 884c61e211dSHarvey Harrison 885c61e211dSHarvey Harrison /* Kernel mode? Handle exceptions or die */ 886c61e211dSHarvey Harrison if (!(error_code & PF_USER)) 887c61e211dSHarvey Harrison goto no_context; 888c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 889c61e211dSHarvey Harrison /* User space => ok to do another page fault */ 890c61e211dSHarvey Harrison if (is_prefetch(regs, address, error_code)) 891c61e211dSHarvey Harrison return; 892c61e211dSHarvey Harrison #endif 893c61e211dSHarvey Harrison tsk->thread.cr2 = address; 894c61e211dSHarvey Harrison tsk->thread.error_code = error_code; 895c61e211dSHarvey Harrison tsk->thread.trap_no = 14; 896c61e211dSHarvey Harrison force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); 897c61e211dSHarvey Harrison } 898c61e211dSHarvey Harrison 899c61e211dSHarvey Harrison DEFINE_SPINLOCK(pgd_lock); 900c61e211dSHarvey Harrison LIST_HEAD(pgd_list); 901c61e211dSHarvey Harrison 902c61e211dSHarvey Harrison void vmalloc_sync_all(void) 903c61e211dSHarvey Harrison { 904c61e211dSHarvey Harrison unsigned long address; 905c61e211dSHarvey Harrison 906cc643d46SJan Beulich #ifdef CONFIG_X86_32 907c61e211dSHarvey Harrison if (SHARED_KERNEL_PMD) 908c61e211dSHarvey Harrison return; 909c61e211dSHarvey Harrison 910cc643d46SJan Beulich for (address = VMALLOC_START & PMD_MASK; 911cc643d46SJan Beulich address >= TASK_SIZE && address < FIXADDR_TOP; 912cc643d46SJan Beulich address += PMD_SIZE) { 913c61e211dSHarvey Harrison unsigned long flags; 914c61e211dSHarvey Harrison struct page *page; 915c61e211dSHarvey Harrison 916c61e211dSHarvey Harrison spin_lock_irqsave(&pgd_lock, flags); 917e3ed910dSJeremy Fitzhardinge list_for_each_entry(page, &pgd_list, lru) { 918c61e211dSHarvey Harrison if (!vmalloc_sync_one(page_address(page), 919e3ed910dSJeremy Fitzhardinge address)) 920c61e211dSHarvey Harrison break; 921c61e211dSHarvey Harrison } 922c61e211dSHarvey Harrison spin_unlock_irqrestore(&pgd_lock, flags); 923c61e211dSHarvey Harrison } 924c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */ 925cc643d46SJan Beulich for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; 926cc643d46SJan Beulich address += PGDIR_SIZE) { 927c61e211dSHarvey Harrison const pgd_t *pgd_ref = pgd_offset_k(address); 92858d5d0d8SIngo Molnar unsigned long flags; 929c61e211dSHarvey Harrison struct page *page; 930c61e211dSHarvey Harrison 931c61e211dSHarvey Harrison if (pgd_none(*pgd_ref)) 932c61e211dSHarvey Harrison continue; 93358d5d0d8SIngo Molnar spin_lock_irqsave(&pgd_lock, flags); 934c61e211dSHarvey Harrison list_for_each_entry(page, &pgd_list, lru) { 935c61e211dSHarvey Harrison pgd_t *pgd; 936c61e211dSHarvey Harrison pgd = (pgd_t *)page_address(page) + pgd_index(address); 937c61e211dSHarvey Harrison if (pgd_none(*pgd)) 938c61e211dSHarvey Harrison set_pgd(pgd, *pgd_ref); 939c61e211dSHarvey Harrison else 940c61e211dSHarvey Harrison BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); 941c61e211dSHarvey Harrison } 94258d5d0d8SIngo Molnar spin_unlock_irqrestore(&pgd_lock, flags); 943c61e211dSHarvey Harrison } 944c61e211dSHarvey Harrison #endif 945c61e211dSHarvey Harrison } 946