1c61e211dSHarvey Harrison /* 2c61e211dSHarvey Harrison * Copyright (C) 1995 Linus Torvalds 3c61e211dSHarvey Harrison * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 4c61e211dSHarvey Harrison */ 5c61e211dSHarvey Harrison 6c61e211dSHarvey Harrison #include <linux/signal.h> 7c61e211dSHarvey Harrison #include <linux/sched.h> 8c61e211dSHarvey Harrison #include <linux/kernel.h> 9c61e211dSHarvey Harrison #include <linux/errno.h> 10c61e211dSHarvey Harrison #include <linux/string.h> 11c61e211dSHarvey Harrison #include <linux/types.h> 12c61e211dSHarvey Harrison #include <linux/ptrace.h> 130fd0e3daSPekka Paalanen #include <linux/mmiotrace.h> 14c61e211dSHarvey Harrison #include <linux/mman.h> 15c61e211dSHarvey Harrison #include <linux/mm.h> 16c61e211dSHarvey Harrison #include <linux/smp.h> 17c61e211dSHarvey Harrison #include <linux/interrupt.h> 18c61e211dSHarvey Harrison #include <linux/init.h> 19c61e211dSHarvey Harrison #include <linux/tty.h> 20c61e211dSHarvey Harrison #include <linux/vt_kern.h> /* For unblank_screen() */ 21c61e211dSHarvey Harrison #include <linux/compiler.h> 22c61e211dSHarvey Harrison #include <linux/highmem.h> 23c61e211dSHarvey Harrison #include <linux/bootmem.h> /* for max_low_pfn */ 24c61e211dSHarvey Harrison #include <linux/vmalloc.h> 25c61e211dSHarvey Harrison #include <linux/module.h> 26c61e211dSHarvey Harrison #include <linux/kprobes.h> 27c61e211dSHarvey Harrison #include <linux/uaccess.h> 28c61e211dSHarvey Harrison #include <linux/kdebug.h> 297c9f8861SEric Sandeen #include <linux/magic.h> 30c61e211dSHarvey Harrison 31c61e211dSHarvey Harrison #include <asm/system.h> 32c61e211dSHarvey Harrison #include <asm/desc.h> 33c61e211dSHarvey Harrison #include <asm/segment.h> 34c61e211dSHarvey Harrison #include <asm/pgalloc.h> 35c61e211dSHarvey Harrison #include <asm/smp.h> 36c61e211dSHarvey Harrison #include <asm/tlbflush.h> 37c61e211dSHarvey Harrison #include <asm/proto.h> 38c61e211dSHarvey Harrison #include <asm-generic/sections.h> 3970ef5641SJaswinder Singh 
#include <asm/traps.h>

/*
 * Page fault error code bits (hardware-defined layout of the x86 page
 * fault error code pushed by the CPU):
 * bit 0 == 0 means no page found, 1 means protection fault
 * bit 1 == 0 means read, 1 means write
 * bit 2 == 0 means kernel, 1 means user-mode
 * bit 3 == 1 means use of reserved bit detected
 * bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT (1<<0)
#define PF_WRITE (1<<1)
#define PF_USER (1<<2)
#define PF_RSVD (1<<3)
#define PF_INSTR (1<<4)

/*
 * Give mmiotrace a chance to claim the fault.  Returns -1 if an active
 * kmmio probe handled it (caller must not process the fault further),
 * 0 otherwise.  Compiles to a plain "return 0" without CONFIG_MMIOTRACE.
 */
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
#endif
	return 0;
}

/*
 * Let a registered kprobe fault handler claim the fault.  Returns 1 when
 * a kprobe consumed it, 0 otherwise.  Only consulted for kernel-mode
 * faults; trap number 14 is the page-fault vector.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode_vm(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 *
 * Returns non-zero when the faulting instruction decodes to a prefetch
 * (0x0F 0x0D / 0x0F 0x18), in which case the fault should be dropped.
 * Scans at most 15 bytes (the architectural max instruction length),
 * skipping valid prefix bytes until a candidate opcode is found.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
		       unsigned long addr)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was a exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;		/* x86 insns are at most 15 bytes */

	/* Never decode kernel addresses on behalf of a user-mode fault */
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		/* Fault-safe read: bail out if the insn byte is unreadable */
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

/*
 * Deliver a fault signal (SIGSEGV/SIGBUS) with the faulting address
 * filled into siginfo for the given task.
 */
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
/* Non-zero when the page-table entry at p cannot be read safely. */
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

/*
 * Print the page-table walk for 'address' to the console as part of an
 * oops report.  The 32-bit variant walks via physical addresses read
 * from CR3; the 64-bit variant walks pgd/pud/pmd/pte pointers guarded
 * by bad_address() so a corrupted table cannot re-fault.
 */
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	/* Stop at a large (1G) pud: there is no pmd/pte level below it */
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
/*
 * Copy one kernel pmd entry from the init_mm reference page table into
 * the given (per-process) pgd, for faults in the vmalloc/module area.
 * Returns the synced pmd, or NULL when the reference entry is absent.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		/* Already present: must already point at the same page table */
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	/* Erratum pattern: the fault address IS the (truncated) RIP */
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	/* Restore the upper 32 bits the CPU cleared */
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		/* warned is static: print the BIOS warning only once per boot */
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		/* Resume at the repaired address instead of faulting */
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
 * addresses >4GB. We catch this in the page fault handler because these
 * addresses are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	/* __USER32_CS or any LDT code segment (TI bit set) => compat mode */
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

/*
 * Pentium "F0 0F C7 C8" erratum: a fault on the IDT entry for vector 6
 * means the locked CMPXCHG8B bug fired; deliver invalid-op instead.
 * Returns 1 when handled, 0 otherwise (always 0 without the workaround).
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		/* IDT entries are 8 bytes; which vector does 'address' hit? */
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

/*
 * Print the "BUG: unable to handle kernel ..." banner for an oops,
 * including a NX-violation hint (PAE), the faulting IP, and the
 * page-table walk for the faulting address.
 */
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	/* Instruction fetch on a present-but-NX page: flag it loudly */
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current_uid());
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
/*
 * Oops on a corrupted kernel page table.  Records the fault state in
 * the task, prints the table walk, and dies; never returns to the
 * faulting context unless __die() asks oops_end() not to kill.
 */
static noinline void pgtable_bad(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	unsigned long flags = oops_begin();
	int sig = SIGKILL;
	struct task_struct *tsk = current;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		sig = 0;	/* __die handled it: don't signal from oops_end */
	oops_end(flags, regs, sig);
}
#endif

/*
 * A fault in kernel mode with no user context to blame: try exception
 * fixups and the CPU-erratum workarounds, otherwise oops and kill the
 * task.  May return (to retry the faulting instruction) only when a
 * fixup or workaround applied.
 */
static noinline void no_context(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	struct task_struct *tsk = current;
	unsigned long *stackend;

#ifdef CONFIG_X86_64
	unsigned long flags;
	int sig;
#endif

	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	/* STACK_END_MAGIC clobbered => likely stack overrun caused this */
	stackend = end_of_stack(tsk);
	if (*stackend != STACK_END_MAGIC)
		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, sig);
#endif
}

/*
 * Fault with no vma (or vma check skipped) and mmap_sem NOT held:
 * SIGSEGV for user mode, no_context() for kernel mode.
 */
static void __bad_area_nosemaphore(struct pt_regs *regs,
			unsigned long error_code, unsigned long address,
			int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address,
			(void *) regs->ip, (void *) regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address);
}

static noinline void bad_area_nosemaphore(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

/*
 * Same as __bad_area_nosemaphore() but called with mmap_sem held for
 * read; drops it before delivering the signal/oops.
 */
static void __bad_area(struct pt_regs *regs,
			unsigned long error_code, unsigned long address,
			int si_code)
{
	struct mm_struct *mm = current->mm;

	/*
	 * Something tried to access memory that isn't in our memory map..
	 * Fix it, but check if it's kernel or user first..
	 */
	up_read(&mm->mmap_sem);

	__bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void bad_area(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_MAPERR);
}

/* Access violation (mapping exists but permissions forbid the access). */
static noinline void bad_area_access_error(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void out_of_memory(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	/*
	 * We ran out of memory, call the OOM killer, and return the userspace
	 * (which will retry the fault, or kill us if we got oom-killed).
	 * mmap_sem must be dropped before invoking the OOM killer.
	 */
	up_read(&current->mm->mmap_sem);
	pagefault_out_of_memory();
}

/*
 * VM_FAULT_SIGBUS path: drop mmap_sem and deliver SIGBUS to user mode;
 * kernel-mode faults go through no_context().
 * NOTE(review): if no_context() returns (fixup applied), control falls
 * through to the SIGBUS delivery below — presumably intentional for
 * faults on user addresses from kernel mode, but verify against callers.
 */
static void do_sigbus(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;

	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		no_context(regs, error_code, address);
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, error_code, address))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

/* Dispatch handle_mm_fault() error bits; callers hold mmap_sem for read. */
static noinline void mm_fault_error(struct pt_regs *regs,
		unsigned long error_code, unsigned long address, unsigned int fault)
{
	if (fault & VM_FAULT_OOM)
		out_of_memory(regs, error_code, address);
	else if (fault & VM_FAULT_SIGBUS)
		do_sigbus(regs, error_code, address);
	else
		BUG();
}

/*
 * A fault is spurious only if the access the hardware complained about
 * is actually permitted by the (pte-or-large-entry) in the table now:
 * write faults need a writable entry, fetch faults an executable one.
 */
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline int spurious_fault(unsigned long error_code,
				unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	/* Large entries terminate the walk early: check them directly */
	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	/* The reference table must already map the address. */
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	/* 0: handled by syncing page tables; -1: not a vmalloc fault. */
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	/* Walk both this mm's tables and the kernel reference tables. */
	pgd = pgd_offset(current->active_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

/*
 * NOTE(review): presumably consulted when reporting unhandled faults
 * to user space; its users are not visible in this hunk — confirm.
 */
int show_unhandled_signals = 1;

/*
 * Decide whether the faulting access was permitted by the vma.
 * Returns 1 (access error) or 0 (access allowed) based on the
 * PF_PROT bit and the vma's VM_READ/VM_WRITE/VM_EXEC flags.
 */
static inline int access_error(unsigned long error_code, int write,
		struct vm_area_struct *vma)
{
	if (write) {
		/* write, present and write, not present */
		if (unlikely(!(vma->vm_flags & VM_WRITE)))
			return 1;
	} else if (unlikely(error_code & PF_PROT)) {
		/* read, present */
		return 1;
	} else {
		/* read, not present */
		if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
			return 1;
	}

	return 0;
}

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address;
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int write;
	int fault;

	tsk = current;
	mm = tsk->mm;
	/* Warm the rwsem's cacheline for write; we will likely take it. */
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	/* mmiotrace hook: may consume the fault entirely. */
	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(error_code, address))
			return;

		/* kprobes don't want to hook the spurious faults. */
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/* Give kprobes a chance to claim genuine (non-spurious) faults. */
	if (unlikely(notify_page_fault(regs)))
		return;
	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
	} else if (regs->flags & X86_EFLAGS_IF)
		local_irq_enable();

#ifdef CONFIG_X86_64
	/*
	 * Reserved bit set in a page-table entry means a corrupt page
	 * table. NOTE(review): pgtable_bad() presumably does not
	 * return — its body is not visible in this hunk.
	 */
	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);
#endif

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	/* Address below the vma: only valid for a growable stack vma. */
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	write = error_code & PF_WRITE;
	if (unlikely(access_error(error_code, write, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, error_code, address, fault);
		return;
	}
	/* Account the fault as major (required I/O) or minor. */
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
}

/* pgd_lock protects pgd_list, walked by vmalloc_sync_all() below. */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

/*
 * Propagate kernel vmalloc-area mappings from the reference page
 * tables into every page directory on pgd_list.
 */
void vmalloc_sync_all(void)
{
	unsigned long address;

#ifdef CONFIG_X86_32
	/* With a shared kernel PMD there is nothing to synchronize. */
	if (SHARED_KERNEL_PMD)
		return;

	/* One PMD-sized step at a time across the vmalloc range. */
	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE && address < FIXADDR_TOP;
	     address += PMD_SIZE) {
		unsigned long flags;
		struct page *page;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			if (!vmalloc_sync_one(page_address(page),
					      address))
				break;
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#else /* CONFIG_X86_64 */
	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
	     address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		unsigned long flags;
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;
		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#endif
}