1c61e211dSHarvey Harrison /* 2c61e211dSHarvey Harrison * Copyright (C) 1995 Linus Torvalds 3c61e211dSHarvey Harrison * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. 4c61e211dSHarvey Harrison */ 5c61e211dSHarvey Harrison 6c61e211dSHarvey Harrison #include <linux/signal.h> 7c61e211dSHarvey Harrison #include <linux/sched.h> 8c61e211dSHarvey Harrison #include <linux/kernel.h> 9c61e211dSHarvey Harrison #include <linux/errno.h> 10c61e211dSHarvey Harrison #include <linux/string.h> 11c61e211dSHarvey Harrison #include <linux/types.h> 12c61e211dSHarvey Harrison #include <linux/ptrace.h> 13c61e211dSHarvey Harrison #include <linux/mman.h> 14c61e211dSHarvey Harrison #include <linux/mm.h> 15c61e211dSHarvey Harrison #include <linux/smp.h> 16c61e211dSHarvey Harrison #include <linux/interrupt.h> 17c61e211dSHarvey Harrison #include <linux/init.h> 18c61e211dSHarvey Harrison #include <linux/tty.h> 19c61e211dSHarvey Harrison #include <linux/vt_kern.h> /* For unblank_screen() */ 20c61e211dSHarvey Harrison #include <linux/compiler.h> 21c61e211dSHarvey Harrison #include <linux/highmem.h> 22c61e211dSHarvey Harrison #include <linux/bootmem.h> /* for max_low_pfn */ 23c61e211dSHarvey Harrison #include <linux/vmalloc.h> 24c61e211dSHarvey Harrison #include <linux/module.h> 25c61e211dSHarvey Harrison #include <linux/kprobes.h> 26c61e211dSHarvey Harrison #include <linux/uaccess.h> 27c61e211dSHarvey Harrison #include <linux/kdebug.h> 28c61e211dSHarvey Harrison 29c61e211dSHarvey Harrison #include <asm/system.h> 30c61e211dSHarvey Harrison #include <asm/desc.h> 31c61e211dSHarvey Harrison #include <asm/segment.h> 32c61e211dSHarvey Harrison #include <asm/pgalloc.h> 33c61e211dSHarvey Harrison #include <asm/smp.h> 34c61e211dSHarvey Harrison #include <asm/tlbflush.h> 35c61e211dSHarvey Harrison #include <asm/proto.h> 36c61e211dSHarvey Harrison #include <asm-generic/sections.h> 37c61e211dSHarvey Harrison 38c61e211dSHarvey Harrison /* 39c61e211dSHarvey Harrison * Page fault error code bits 40c61e211dSHarvey 
Harrison * bit 0 == 0 means no page found, 1 means protection fault 41c61e211dSHarvey Harrison * bit 1 == 0 means read, 1 means write 42c61e211dSHarvey Harrison * bit 2 == 0 means kernel, 1 means user-mode 43c61e211dSHarvey Harrison * bit 3 == 1 means use of reserved bit detected 44c61e211dSHarvey Harrison * bit 4 == 1 means fault was an instruction fetch 45c61e211dSHarvey Harrison */ 46c61e211dSHarvey Harrison #define PF_PROT (1<<0) 47c61e211dSHarvey Harrison #define PF_WRITE (1<<1) 48c61e211dSHarvey Harrison #define PF_USER (1<<2) 49c61e211dSHarvey Harrison #define PF_RSVD (1<<3) 50c61e211dSHarvey Harrison #define PF_INSTR (1<<4) 51c61e211dSHarvey Harrison 52c61e211dSHarvey Harrison static inline int notify_page_fault(struct pt_regs *regs) 53c61e211dSHarvey Harrison { 54c61e211dSHarvey Harrison #ifdef CONFIG_KPROBES 55c61e211dSHarvey Harrison int ret = 0; 56c61e211dSHarvey Harrison 57c61e211dSHarvey Harrison /* kprobe_running() needs smp_processor_id() */ 58c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 59c61e211dSHarvey Harrison if (!user_mode_vm(regs)) { 60c61e211dSHarvey Harrison #else 61c61e211dSHarvey Harrison if (!user_mode(regs)) { 62c61e211dSHarvey Harrison #endif 63c61e211dSHarvey Harrison preempt_disable(); 64c61e211dSHarvey Harrison if (kprobe_running() && kprobe_fault_handler(regs, 14)) 65c61e211dSHarvey Harrison ret = 1; 66c61e211dSHarvey Harrison preempt_enable(); 67c61e211dSHarvey Harrison } 68c61e211dSHarvey Harrison 69c61e211dSHarvey Harrison return ret; 70c61e211dSHarvey Harrison #else 71c61e211dSHarvey Harrison return 0; 72c61e211dSHarvey Harrison #endif 73c61e211dSHarvey Harrison } 74c61e211dSHarvey Harrison 75c61e211dSHarvey Harrison /* 76c61e211dSHarvey Harrison * X86_32 77c61e211dSHarvey Harrison * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. 78c61e211dSHarvey Harrison * Check that here and ignore it. 
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	/* Nothing to work around unless NX is actually in use. */
	if (!(__supported_pte_mask & _PAGE_NX))
		return 0;
#endif

	/* If it was an exec fault on an NX page, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;	/* x86 instructions are at most 15 bytes */

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	/*
	 * Walk the instruction bytes, skipping over valid prefixes, until
	 * we can tell whether this is a prefetch insn (0F 0D / 0F 18).
	 * scan_more stays set as long as we keep seeing prefix bytes.
	 */
	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			/* Fetch the second opcode byte. */
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

/*
 * Queue the given signal for @tsk with the faulting address filled
 * into the siginfo.
 */
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
/* Non-zero if @p cannot be read safely; page-table walk helper. */
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

/*
 * Print the hardware page-table entries covering @address for oops
 * reports. Walks down from CR3 and stops at the first entry that is
 * not present (or, on 64-bit, not safely readable).
 */
void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	/* Stop at a large page: there is no pte level below it. */
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
/*
 * Copy the kernel mapping for @address from the reference (init_mm)
 * page tables into @pgd. Returns the kernel pmd entry, or NULL when
 * the reference tables have no mapping there either.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
313c61e211dSHarvey Harrison Note we only handle faults in kernel here. 314c61e211dSHarvey Harrison Does nothing for X86_32 315c61e211dSHarvey Harrison */ 316c61e211dSHarvey Harrison static int is_errata93(struct pt_regs *regs, unsigned long address) 317c61e211dSHarvey Harrison { 318c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 319c61e211dSHarvey Harrison static int warned; 320c61e211dSHarvey Harrison if (address != regs->ip) 321c61e211dSHarvey Harrison return 0; 322c61e211dSHarvey Harrison if ((address >> 32) != 0) 323c61e211dSHarvey Harrison return 0; 324c61e211dSHarvey Harrison address |= 0xffffffffUL << 32; 325c61e211dSHarvey Harrison if ((address >= (u64)_stext && address <= (u64)_etext) || 326c61e211dSHarvey Harrison (address >= MODULES_VADDR && address <= MODULES_END)) { 327c61e211dSHarvey Harrison if (!warned) { 328c61e211dSHarvey Harrison printk(errata93_warning); 329c61e211dSHarvey Harrison warned = 1; 330c61e211dSHarvey Harrison } 331c61e211dSHarvey Harrison regs->ip = address; 332c61e211dSHarvey Harrison return 1; 333c61e211dSHarvey Harrison } 334c61e211dSHarvey Harrison #endif 335c61e211dSHarvey Harrison return 0; 336c61e211dSHarvey Harrison } 337c61e211dSHarvey Harrison 338c61e211dSHarvey Harrison /* 339c61e211dSHarvey Harrison * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal 340c61e211dSHarvey Harrison * addresses >4GB. We catch this in the page fault handler because these 341c61e211dSHarvey Harrison * addresses are not reachable. Just detect this case and return. Any code 342c61e211dSHarvey Harrison * segment in LDT is compatibility mode. 
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	/* A 32bit code segment: either the compat segment or an LDT entry. */
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

/*
 * Pentium F00F erratum: a fault whose address falls on IDT slot 6
 * (invalid opcode) is redirected to the invalid-opcode handler.
 * Returns 1 when the fault was handled that way.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

/*
 * Print the "BUG: unable to handle kernel ..." oops header: the
 * faulting address, the IP, and a dump of the page tables.
 */
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	/*
	 * An instruction fetch that faulted on a present but NX-marked
	 * page looks like an exploit attempt; call it out explicitly.
	 */
	if (error_code & PF_INSTR) {
		int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
/*
 * Oops on a corrupted kernel page table (reserved bits set in an
 * entry). Records the fault in the task and ends in oops_end() with
 * SIGKILL; does not return to the faulting context.
 */
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	/*
	 * The entry is present: the fault is spurious only if the entry
	 * already permits the access that faulted.
	 */
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the later
	 * case just flush.
	 */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared
	 */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that.
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

/* Non-zero: log unhandled user-space faults/signals to the kernel log. */
int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
576c61e211dSHarvey Harrison */ 577c61e211dSHarvey Harrison trace_hardirqs_fixup(); 578c61e211dSHarvey Harrison 579c61e211dSHarvey Harrison tsk = current; 580c61e211dSHarvey Harrison mm = tsk->mm; 581c61e211dSHarvey Harrison prefetchw(&mm->mmap_sem); 582c61e211dSHarvey Harrison 583c61e211dSHarvey Harrison /* get the address */ 584c61e211dSHarvey Harrison address = read_cr2(); 585c61e211dSHarvey Harrison 586c61e211dSHarvey Harrison si_code = SEGV_MAPERR; 587c61e211dSHarvey Harrison 588c61e211dSHarvey Harrison if (notify_page_fault(regs)) 589c61e211dSHarvey Harrison return; 590c61e211dSHarvey Harrison 591c61e211dSHarvey Harrison /* 592c61e211dSHarvey Harrison * We fault-in kernel-space virtual memory on-demand. The 593c61e211dSHarvey Harrison * 'reference' page table is init_mm.pgd. 594c61e211dSHarvey Harrison * 595c61e211dSHarvey Harrison * NOTE! We MUST NOT take any locks for this case. We may 596c61e211dSHarvey Harrison * be in an interrupt or a critical region, and should 597c61e211dSHarvey Harrison * only copy the information from the master page table, 598c61e211dSHarvey Harrison * nothing more. 599c61e211dSHarvey Harrison * 600c61e211dSHarvey Harrison * This verifies that the fault happens in kernel space 601c61e211dSHarvey Harrison * (error_code & 4) == 0, and that the fault was not a 602c61e211dSHarvey Harrison * protection error (error_code & 9) == 0. 
603c61e211dSHarvey Harrison */ 604c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 605c61e211dSHarvey Harrison if (unlikely(address >= TASK_SIZE)) { 606c61e211dSHarvey Harrison if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 607c61e211dSHarvey Harrison vmalloc_fault(address) >= 0) 608c61e211dSHarvey Harrison return; 6095b727a3bSJeremy Fitzhardinge 6105b727a3bSJeremy Fitzhardinge /* Can handle a stale RO->RW TLB */ 6115b727a3bSJeremy Fitzhardinge if (spurious_fault(address, error_code)) 6125b727a3bSJeremy Fitzhardinge return; 6135b727a3bSJeremy Fitzhardinge 614c61e211dSHarvey Harrison /* 615c61e211dSHarvey Harrison * Don't take the mm semaphore here. If we fixup a prefetch 616c61e211dSHarvey Harrison * fault we could otherwise deadlock. 617c61e211dSHarvey Harrison */ 618c61e211dSHarvey Harrison goto bad_area_nosemaphore; 619c61e211dSHarvey Harrison } 620c61e211dSHarvey Harrison 621c61e211dSHarvey Harrison /* It's safe to allow irq's after cr2 has been saved and the vmalloc 622c61e211dSHarvey Harrison fault has been handled. */ 623c61e211dSHarvey Harrison if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) 624c61e211dSHarvey Harrison local_irq_enable(); 625c61e211dSHarvey Harrison 626c61e211dSHarvey Harrison /* 627c61e211dSHarvey Harrison * If we're in an interrupt, have no user context or are running in an 628c61e211dSHarvey Harrison * atomic region then we must not take the fault. 629c61e211dSHarvey Harrison */ 630c61e211dSHarvey Harrison if (in_atomic() || !mm) 631c61e211dSHarvey Harrison goto bad_area_nosemaphore; 632c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */ 633c61e211dSHarvey Harrison if (unlikely(address >= TASK_SIZE64)) { 634c61e211dSHarvey Harrison /* 635c61e211dSHarvey Harrison * Don't check for the module range here: its PML4 636c61e211dSHarvey Harrison * is always initialized because it's shared with the main 637c61e211dSHarvey Harrison * kernel text. Only vmalloc may need PML4 syncups. 
638c61e211dSHarvey Harrison */ 639c61e211dSHarvey Harrison if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && 640c61e211dSHarvey Harrison ((address >= VMALLOC_START && address < VMALLOC_END))) { 641c61e211dSHarvey Harrison if (vmalloc_fault(address) >= 0) 642c61e211dSHarvey Harrison return; 643c61e211dSHarvey Harrison } 6445b727a3bSJeremy Fitzhardinge 6455b727a3bSJeremy Fitzhardinge /* Can handle a stale RO->RW TLB */ 6465b727a3bSJeremy Fitzhardinge if (spurious_fault(address, error_code)) 6475b727a3bSJeremy Fitzhardinge return; 6485b727a3bSJeremy Fitzhardinge 649c61e211dSHarvey Harrison /* 650c61e211dSHarvey Harrison * Don't take the mm semaphore here. If we fixup a prefetch 651c61e211dSHarvey Harrison * fault we could otherwise deadlock. 652c61e211dSHarvey Harrison */ 653c61e211dSHarvey Harrison goto bad_area_nosemaphore; 654c61e211dSHarvey Harrison } 655c61e211dSHarvey Harrison if (likely(regs->flags & X86_EFLAGS_IF)) 656c61e211dSHarvey Harrison local_irq_enable(); 657c61e211dSHarvey Harrison 658c61e211dSHarvey Harrison if (unlikely(error_code & PF_RSVD)) 659c61e211dSHarvey Harrison pgtable_bad(address, regs, error_code); 660c61e211dSHarvey Harrison 661c61e211dSHarvey Harrison /* 662c61e211dSHarvey Harrison * If we're in an interrupt, have no user context or are running in an 663c61e211dSHarvey Harrison * atomic region then we must not take the fault. 664c61e211dSHarvey Harrison */ 665c61e211dSHarvey Harrison if (unlikely(in_atomic() || !mm)) 666c61e211dSHarvey Harrison goto bad_area_nosemaphore; 667c61e211dSHarvey Harrison 668c61e211dSHarvey Harrison /* 669c61e211dSHarvey Harrison * User-mode registers count as a user access even for any 670c61e211dSHarvey Harrison * potential system fault or CPU buglet. 
671c61e211dSHarvey Harrison */ 672c61e211dSHarvey Harrison if (user_mode_vm(regs)) 673c61e211dSHarvey Harrison error_code |= PF_USER; 674c61e211dSHarvey Harrison again: 675c61e211dSHarvey Harrison #endif 676c61e211dSHarvey Harrison /* When running in the kernel we expect faults to occur only to 677c61e211dSHarvey Harrison * addresses in user space. All other faults represent errors in the 678c61e211dSHarvey Harrison * kernel and should generate an OOPS. Unfortunately, in the case of an 679c61e211dSHarvey Harrison * erroneous fault occurring in a code path which already holds mmap_sem 680c61e211dSHarvey Harrison * we will deadlock attempting to validate the fault against the 681c61e211dSHarvey Harrison * address space. Luckily the kernel only validly references user 682c61e211dSHarvey Harrison * space from well defined areas of code, which are listed in the 683c61e211dSHarvey Harrison * exceptions table. 684c61e211dSHarvey Harrison * 685c61e211dSHarvey Harrison * As the vast majority of faults will be valid we will only perform 686c61e211dSHarvey Harrison * the source reference check when there is a possibility of a deadlock. 687c61e211dSHarvey Harrison * Attempt to lock the address space, if we cannot we then validate the 688c61e211dSHarvey Harrison * source. If this is invalid we can skip the address space check, 689c61e211dSHarvey Harrison * thus avoiding the deadlock. 
690c61e211dSHarvey Harrison */ 691c61e211dSHarvey Harrison if (!down_read_trylock(&mm->mmap_sem)) { 692c61e211dSHarvey Harrison if ((error_code & PF_USER) == 0 && 693c61e211dSHarvey Harrison !search_exception_tables(regs->ip)) 694c61e211dSHarvey Harrison goto bad_area_nosemaphore; 695c61e211dSHarvey Harrison down_read(&mm->mmap_sem); 696c61e211dSHarvey Harrison } 697c61e211dSHarvey Harrison 698c61e211dSHarvey Harrison vma = find_vma(mm, address); 699c61e211dSHarvey Harrison if (!vma) 700c61e211dSHarvey Harrison goto bad_area; 701c61e211dSHarvey Harrison if (vma->vm_start <= address) 702c61e211dSHarvey Harrison goto good_area; 703c61e211dSHarvey Harrison if (!(vma->vm_flags & VM_GROWSDOWN)) 704c61e211dSHarvey Harrison goto bad_area; 705c61e211dSHarvey Harrison if (error_code & PF_USER) { 706c61e211dSHarvey Harrison /* 707c61e211dSHarvey Harrison * Accessing the stack below %sp is always a bug. 708c61e211dSHarvey Harrison * The large cushion allows instructions like enter 709c61e211dSHarvey Harrison * and pusha to work. ("enter $65535,$31" pushes 710c61e211dSHarvey Harrison * 32 pointers and then decrements %sp by 65535.) 711c61e211dSHarvey Harrison */ 712c61e211dSHarvey Harrison if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 713c61e211dSHarvey Harrison goto bad_area; 714c61e211dSHarvey Harrison } 715c61e211dSHarvey Harrison if (expand_stack(vma, address)) 716c61e211dSHarvey Harrison goto bad_area; 717c61e211dSHarvey Harrison /* 718c61e211dSHarvey Harrison * Ok, we have a good vm_area for this memory access, so 719c61e211dSHarvey Harrison * we can handle it.. 
720c61e211dSHarvey Harrison */ 721c61e211dSHarvey Harrison good_area: 722c61e211dSHarvey Harrison si_code = SEGV_ACCERR; 723c61e211dSHarvey Harrison write = 0; 724c61e211dSHarvey Harrison switch (error_code & (PF_PROT|PF_WRITE)) { 725c61e211dSHarvey Harrison default: /* 3: write, present */ 726c61e211dSHarvey Harrison /* fall through */ 727c61e211dSHarvey Harrison case PF_WRITE: /* write, not present */ 728c61e211dSHarvey Harrison if (!(vma->vm_flags & VM_WRITE)) 729c61e211dSHarvey Harrison goto bad_area; 730c61e211dSHarvey Harrison write++; 731c61e211dSHarvey Harrison break; 732c61e211dSHarvey Harrison case PF_PROT: /* read, present */ 733c61e211dSHarvey Harrison goto bad_area; 734c61e211dSHarvey Harrison case 0: /* read, not present */ 735c61e211dSHarvey Harrison if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) 736c61e211dSHarvey Harrison goto bad_area; 737c61e211dSHarvey Harrison } 738c61e211dSHarvey Harrison 739c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 740c61e211dSHarvey Harrison survive: 741c61e211dSHarvey Harrison #endif 742c61e211dSHarvey Harrison /* 743c61e211dSHarvey Harrison * If for any reason at all we couldn't handle the fault, 744c61e211dSHarvey Harrison * make sure we exit gracefully rather than endlessly redo 745c61e211dSHarvey Harrison * the fault. 
746c61e211dSHarvey Harrison */ 747c61e211dSHarvey Harrison fault = handle_mm_fault(mm, vma, address, write); 748c61e211dSHarvey Harrison if (unlikely(fault & VM_FAULT_ERROR)) { 749c61e211dSHarvey Harrison if (fault & VM_FAULT_OOM) 750c61e211dSHarvey Harrison goto out_of_memory; 751c61e211dSHarvey Harrison else if (fault & VM_FAULT_SIGBUS) 752c61e211dSHarvey Harrison goto do_sigbus; 753c61e211dSHarvey Harrison BUG(); 754c61e211dSHarvey Harrison } 755c61e211dSHarvey Harrison if (fault & VM_FAULT_MAJOR) 756c61e211dSHarvey Harrison tsk->maj_flt++; 757c61e211dSHarvey Harrison else 758c61e211dSHarvey Harrison tsk->min_flt++; 759c61e211dSHarvey Harrison 760c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 761c61e211dSHarvey Harrison /* 762c61e211dSHarvey Harrison * Did it hit the DOS screen memory VA from vm86 mode? 763c61e211dSHarvey Harrison */ 764c61e211dSHarvey Harrison if (v8086_mode(regs)) { 765c61e211dSHarvey Harrison unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; 766c61e211dSHarvey Harrison if (bit < 32) 767c61e211dSHarvey Harrison tsk->thread.screen_bitmap |= 1 << bit; 768c61e211dSHarvey Harrison } 769c61e211dSHarvey Harrison #endif 770c61e211dSHarvey Harrison up_read(&mm->mmap_sem); 771c61e211dSHarvey Harrison return; 772c61e211dSHarvey Harrison 773c61e211dSHarvey Harrison /* 774c61e211dSHarvey Harrison * Something tried to access memory that isn't in our memory map.. 775c61e211dSHarvey Harrison * Fix it, but check if it's kernel or user first.. 776c61e211dSHarvey Harrison */ 777c61e211dSHarvey Harrison bad_area: 778c61e211dSHarvey Harrison up_read(&mm->mmap_sem); 779c61e211dSHarvey Harrison 780c61e211dSHarvey Harrison bad_area_nosemaphore: 781c61e211dSHarvey Harrison /* User mode accesses just cause a SIGSEGV */ 782c61e211dSHarvey Harrison if (error_code & PF_USER) { 783c61e211dSHarvey Harrison /* 784c61e211dSHarvey Harrison * It's possible to have interrupts off here. 
785c61e211dSHarvey Harrison */ 786c61e211dSHarvey Harrison local_irq_enable(); 787c61e211dSHarvey Harrison 788c61e211dSHarvey Harrison /* 789c61e211dSHarvey Harrison * Valid to do another page fault here because this one came 790c61e211dSHarvey Harrison * from user space. 791c61e211dSHarvey Harrison */ 792c61e211dSHarvey Harrison if (is_prefetch(regs, address, error_code)) 793c61e211dSHarvey Harrison return; 794c61e211dSHarvey Harrison 795c61e211dSHarvey Harrison if (is_errata100(regs, address)) 796c61e211dSHarvey Harrison return; 797c61e211dSHarvey Harrison 798c61e211dSHarvey Harrison if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 799c61e211dSHarvey Harrison printk_ratelimit()) { 800c61e211dSHarvey Harrison printk( 801c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 802c61e211dSHarvey Harrison "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx", 803c61e211dSHarvey Harrison #else 804c61e211dSHarvey Harrison "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx", 805c61e211dSHarvey Harrison #endif 806c61e211dSHarvey Harrison task_pid_nr(tsk) > 1 ? 
KERN_INFO : KERN_EMERG, 807c61e211dSHarvey Harrison tsk->comm, task_pid_nr(tsk), address, regs->ip, 808c61e211dSHarvey Harrison regs->sp, error_code); 809c61e211dSHarvey Harrison print_vma_addr(" in ", regs->ip); 810c61e211dSHarvey Harrison printk("\n"); 811c61e211dSHarvey Harrison } 812c61e211dSHarvey Harrison 813c61e211dSHarvey Harrison tsk->thread.cr2 = address; 814c61e211dSHarvey Harrison /* Kernel addresses are always protection faults */ 815c61e211dSHarvey Harrison tsk->thread.error_code = error_code | (address >= TASK_SIZE); 816c61e211dSHarvey Harrison tsk->thread.trap_no = 14; 817c61e211dSHarvey Harrison force_sig_info_fault(SIGSEGV, si_code, address, tsk); 818c61e211dSHarvey Harrison return; 819c61e211dSHarvey Harrison } 820c61e211dSHarvey Harrison 821c61e211dSHarvey Harrison if (is_f00f_bug(regs, address)) 822c61e211dSHarvey Harrison return; 823c61e211dSHarvey Harrison 824c61e211dSHarvey Harrison no_context: 825c61e211dSHarvey Harrison /* Are we prepared to handle this kernel fault? */ 826c61e211dSHarvey Harrison if (fixup_exception(regs)) 827c61e211dSHarvey Harrison return; 828c61e211dSHarvey Harrison 829c61e211dSHarvey Harrison /* 830c61e211dSHarvey Harrison * X86_32 831c61e211dSHarvey Harrison * Valid to do another page fault here, because if this fault 832c61e211dSHarvey Harrison * had been triggered by is_prefetch fixup_exception would have 833c61e211dSHarvey Harrison * handled it. 834c61e211dSHarvey Harrison * 835c61e211dSHarvey Harrison * X86_64 836c61e211dSHarvey Harrison * Hall of shame of CPU/BIOS bugs. 837c61e211dSHarvey Harrison */ 838c61e211dSHarvey Harrison if (is_prefetch(regs, address, error_code)) 839c61e211dSHarvey Harrison return; 840c61e211dSHarvey Harrison 841c61e211dSHarvey Harrison if (is_errata93(regs, address)) 842c61e211dSHarvey Harrison return; 843c61e211dSHarvey Harrison 844c61e211dSHarvey Harrison /* 845c61e211dSHarvey Harrison * Oops. The kernel tried to access some bad page. 
We'll have to 846c61e211dSHarvey Harrison * terminate things with extreme prejudice. 847c61e211dSHarvey Harrison */ 848c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 849c61e211dSHarvey Harrison bust_spinlocks(1); 850*fd40d6e3SHarvey Harrison #else 851*fd40d6e3SHarvey Harrison flags = oops_begin(); 852*fd40d6e3SHarvey Harrison #endif 853c61e211dSHarvey Harrison 854c61e211dSHarvey Harrison show_fault_oops(regs, error_code, address); 855c61e211dSHarvey Harrison 856c61e211dSHarvey Harrison tsk->thread.cr2 = address; 857c61e211dSHarvey Harrison tsk->thread.trap_no = 14; 858c61e211dSHarvey Harrison tsk->thread.error_code = error_code; 859*fd40d6e3SHarvey Harrison 860*fd40d6e3SHarvey Harrison #ifdef CONFIG_X86_32 861c61e211dSHarvey Harrison die("Oops", regs, error_code); 862c61e211dSHarvey Harrison bust_spinlocks(0); 863c61e211dSHarvey Harrison do_exit(SIGKILL); 864*fd40d6e3SHarvey Harrison #else 865c61e211dSHarvey Harrison if (__die("Oops", regs, error_code)) 866c61e211dSHarvey Harrison regs = NULL; 867c61e211dSHarvey Harrison /* Executive summary in case the body of the oops scrolled away */ 868c61e211dSHarvey Harrison printk(KERN_EMERG "CR2: %016lx\n", address); 869c61e211dSHarvey Harrison oops_end(flags, regs, SIGKILL); 870c61e211dSHarvey Harrison #endif 871c61e211dSHarvey Harrison 872c61e211dSHarvey Harrison /* 873c61e211dSHarvey Harrison * We ran out of memory, or some other thing happened to us that made 874c61e211dSHarvey Harrison * us unable to handle the page fault gracefully. 
875c61e211dSHarvey Harrison */ 876c61e211dSHarvey Harrison out_of_memory: 877c61e211dSHarvey Harrison up_read(&mm->mmap_sem); 878c61e211dSHarvey Harrison if (is_global_init(tsk)) { 879c61e211dSHarvey Harrison yield(); 880*fd40d6e3SHarvey Harrison #ifdef CONFIG_X86_32 881c61e211dSHarvey Harrison down_read(&mm->mmap_sem); 882c61e211dSHarvey Harrison goto survive; 883c61e211dSHarvey Harrison #else 884c61e211dSHarvey Harrison goto again; 885c61e211dSHarvey Harrison #endif 886*fd40d6e3SHarvey Harrison } 887*fd40d6e3SHarvey Harrison 888c61e211dSHarvey Harrison printk("VM: killing process %s\n", tsk->comm); 889c61e211dSHarvey Harrison if (error_code & PF_USER) 890c61e211dSHarvey Harrison do_group_exit(SIGKILL); 891c61e211dSHarvey Harrison goto no_context; 892c61e211dSHarvey Harrison 893c61e211dSHarvey Harrison do_sigbus: 894c61e211dSHarvey Harrison up_read(&mm->mmap_sem); 895c61e211dSHarvey Harrison 896c61e211dSHarvey Harrison /* Kernel mode? Handle exceptions or die */ 897c61e211dSHarvey Harrison if (!(error_code & PF_USER)) 898c61e211dSHarvey Harrison goto no_context; 899c61e211dSHarvey Harrison #ifdef CONFIG_X86_32 900c61e211dSHarvey Harrison /* User space => ok to do another page fault */ 901c61e211dSHarvey Harrison if (is_prefetch(regs, address, error_code)) 902c61e211dSHarvey Harrison return; 903c61e211dSHarvey Harrison #endif 904c61e211dSHarvey Harrison tsk->thread.cr2 = address; 905c61e211dSHarvey Harrison tsk->thread.error_code = error_code; 906c61e211dSHarvey Harrison tsk->thread.trap_no = 14; 907c61e211dSHarvey Harrison force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); 908c61e211dSHarvey Harrison } 909c61e211dSHarvey Harrison 910c61e211dSHarvey Harrison #ifdef CONFIG_X86_64 911c61e211dSHarvey Harrison DEFINE_SPINLOCK(pgd_lock); 912c61e211dSHarvey Harrison LIST_HEAD(pgd_list); 913c61e211dSHarvey Harrison #endif 914c61e211dSHarvey Harrison 915c61e211dSHarvey Harrison void vmalloc_sync_all(void) 916c61e211dSHarvey Harrison { 917c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_32
	/*
	 * Propagate kernel vmalloc-area mappings into every process page
	 * directory: top-level entries for kernel space are created lazily
	 * in the reference (init_mm) tables, and this copies any entries
	 * the per-process pgds are still missing.
	 *
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);	/* pgd slots already known-synced */
	static unsigned long start = TASK_SIZE;		/* lowest possibly-unsynced address */
	unsigned long address;

	/* When the kernel pmd is shared across pgds there is nothing to copy. */
	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	/*
	 * Walk each PGDIR_SIZE slot of kernel space.  NOTE(review): loop
	 * termination appears to rely on 'address' wrapping past the top of
	 * the address space so that 'address >= TASK_SIZE' fails.
	 */
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			/* 32-bit pgd_list is a raw chain threaded through page->index. */
			spin_lock_irqsave(&pgd_lock, flags);
			for (page = pgd_list; page; page =
					(struct page *)page->index)
				if (!vmalloc_sync_one(page_address(page),
								address)) {
					/*
					 * A failed sync is only tolerated for
					 * the list head; any later failure
					 * would mean the pgds had diverged.
					 */
					BUG_ON(page != pgd_list);
					break;
				}
			spin_unlock_irqrestore(&pgd_lock, flags);
			/* Mark the slot synced only if every pgd accepted it. */
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		/* Advance the resume point past slots proven synced. */
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * 64-bit variant: copy missing vmalloc-range pgd entries from the
	 * kernel reference tables (init_mm, via pgd_offset_k) into every
	 * pgd on pgd_list.
	 *
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);	/* pgd slots already known-synced */
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			/* Nothing to propagate if init_mm has no entry yet. */
			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					/* An already-present entry must agree with the reference. */
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/*
	 * Check that there is no need to do the same for the modules area:
	 * the build-time asserts below require the modules range to live in
	 * the same pgd slot as the kernel text, whose entry is always
	 * initialized.
	 */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
#endif
}