xref: /openbmc/linux/arch/x86/mm/fault.c (revision 5b727a3b0158a129827c21ce3bfb0ba997e8ddd0)
1c61e211dSHarvey Harrison /*
2c61e211dSHarvey Harrison  *  Copyright (C) 1995  Linus Torvalds
3c61e211dSHarvey Harrison  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4c61e211dSHarvey Harrison  */
5c61e211dSHarvey Harrison 
6c61e211dSHarvey Harrison #include <linux/signal.h>
7c61e211dSHarvey Harrison #include <linux/sched.h>
8c61e211dSHarvey Harrison #include <linux/kernel.h>
9c61e211dSHarvey Harrison #include <linux/errno.h>
10c61e211dSHarvey Harrison #include <linux/string.h>
11c61e211dSHarvey Harrison #include <linux/types.h>
12c61e211dSHarvey Harrison #include <linux/ptrace.h>
13c61e211dSHarvey Harrison #include <linux/mman.h>
14c61e211dSHarvey Harrison #include <linux/mm.h>
15c61e211dSHarvey Harrison #include <linux/smp.h>
16c61e211dSHarvey Harrison #include <linux/interrupt.h>
17c61e211dSHarvey Harrison #include <linux/init.h>
18c61e211dSHarvey Harrison #include <linux/tty.h>
19c61e211dSHarvey Harrison #include <linux/vt_kern.h>		/* For unblank_screen() */
20c61e211dSHarvey Harrison #include <linux/compiler.h>
21c61e211dSHarvey Harrison #include <linux/highmem.h>
22c61e211dSHarvey Harrison #include <linux/bootmem.h>		/* for max_low_pfn */
23c61e211dSHarvey Harrison #include <linux/vmalloc.h>
24c61e211dSHarvey Harrison #include <linux/module.h>
25c61e211dSHarvey Harrison #include <linux/kprobes.h>
26c61e211dSHarvey Harrison #include <linux/uaccess.h>
27c61e211dSHarvey Harrison #include <linux/kdebug.h>
28c61e211dSHarvey Harrison 
29c61e211dSHarvey Harrison #include <asm/system.h>
30c61e211dSHarvey Harrison #include <asm/desc.h>
31c61e211dSHarvey Harrison #include <asm/segment.h>
32c61e211dSHarvey Harrison #include <asm/pgalloc.h>
33c61e211dSHarvey Harrison #include <asm/smp.h>
34c61e211dSHarvey Harrison #include <asm/tlbflush.h>
35c61e211dSHarvey Harrison #include <asm/proto.h>
36c61e211dSHarvey Harrison #include <asm-generic/sections.h>
37c61e211dSHarvey Harrison 
38c61e211dSHarvey Harrison /*
39c61e211dSHarvey Harrison  * Page fault error code bits
40c61e211dSHarvey Harrison  *	bit 0 == 0 means no page found, 1 means protection fault
41c61e211dSHarvey Harrison  *	bit 1 == 0 means read, 1 means write
42c61e211dSHarvey Harrison  *	bit 2 == 0 means kernel, 1 means user-mode
43c61e211dSHarvey Harrison  *	bit 3 == 1 means use of reserved bit detected
44c61e211dSHarvey Harrison  *	bit 4 == 1 means fault was an instruction fetch
45c61e211dSHarvey Harrison  */
/* These mirror the hardware #PF error code the CPU pushes on the stack. */
46c61e211dSHarvey Harrison #define PF_PROT		(1<<0)
47c61e211dSHarvey Harrison #define PF_WRITE	(1<<1)
48c61e211dSHarvey Harrison #define PF_USER		(1<<2)
49c61e211dSHarvey Harrison #define PF_RSVD		(1<<3)
50c61e211dSHarvey Harrison #define PF_INSTR	(1<<4)
51c61e211dSHarvey Harrison 
/*
 * Give registered kprobes a chance to handle this page fault (trap 14).
 * Returns 1 if a kprobe fault handler consumed the fault, 0 otherwise.
 * Without CONFIG_KPROBES this compiles to a constant 0.
 */
52c61e211dSHarvey Harrison static inline int notify_page_fault(struct pt_regs *regs)
53c61e211dSHarvey Harrison {
54c61e211dSHarvey Harrison #ifdef CONFIG_KPROBES
55c61e211dSHarvey Harrison 	int ret = 0;
56c61e211dSHarvey Harrison 
57c61e211dSHarvey Harrison 	/* kprobe_running() needs smp_processor_id() */
	/* Only kernel-mode faults are offered to kprobes. */
58c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
59c61e211dSHarvey Harrison 	if (!user_mode_vm(regs)) {
60c61e211dSHarvey Harrison #else
61c61e211dSHarvey Harrison 	if (!user_mode(regs)) {
62c61e211dSHarvey Harrison #endif
63c61e211dSHarvey Harrison 		preempt_disable();
64c61e211dSHarvey Harrison 		if (kprobe_running() && kprobe_fault_handler(regs, 14))
65c61e211dSHarvey Harrison 			ret = 1;
66c61e211dSHarvey Harrison 		preempt_enable();
67c61e211dSHarvey Harrison 	}
68c61e211dSHarvey Harrison 
69c61e211dSHarvey Harrison 	return ret;
70c61e211dSHarvey Harrison #else
71c61e211dSHarvey Harrison 	return 0;
72c61e211dSHarvey Harrison #endif
73c61e211dSHarvey Harrison }
74c61e211dSHarvey Harrison 
75c61e211dSHarvey Harrison /*
76c61e211dSHarvey Harrison  * X86_32
77c61e211dSHarvey Harrison  * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
78c61e211dSHarvey Harrison  * Check that here and ignore it.
79c61e211dSHarvey Harrison  *
80c61e211dSHarvey Harrison  * X86_64
81c61e211dSHarvey Harrison  * Sometimes the CPU reports invalid exceptions on prefetch.
82c61e211dSHarvey Harrison  * Check that here and ignore it.
83c61e211dSHarvey Harrison  *
84c61e211dSHarvey Harrison  * Opcode checker based on code by Richard Brunner
85c61e211dSHarvey Harrison  */
86c61e211dSHarvey Harrison static int is_prefetch(struct pt_regs *regs, unsigned long addr,
87c61e211dSHarvey Harrison 		       unsigned long error_code)
88c61e211dSHarvey Harrison {
89c61e211dSHarvey Harrison 	unsigned char *instr;
90c61e211dSHarvey Harrison 	int scan_more = 1;
91c61e211dSHarvey Harrison 	int prefetch = 0;
92c61e211dSHarvey Harrison 	unsigned char *max_instr;
93c61e211dSHarvey Harrison 
94c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
95b406ac61SHarvey Harrison 	if (!(__supported_pte_mask & _PAGE_NX))
96c61e211dSHarvey Harrison 		return 0;
97c61e211dSHarvey Harrison #endif
98b406ac61SHarvey Harrison 
99c61e211dSHarvey Harrison 	/* If it was a exec fault on NX page, ignore */
100c61e211dSHarvey Harrison 	if (error_code & PF_INSTR)
101c61e211dSHarvey Harrison 		return 0;
102c61e211dSHarvey Harrison 
103c61e211dSHarvey Harrison 	instr = (unsigned char *)convert_ip_to_linear(current, regs);
104c61e211dSHarvey Harrison 	max_instr = instr + 15;
105c61e211dSHarvey Harrison 
106c61e211dSHarvey Harrison 	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
107c61e211dSHarvey Harrison 		return 0;
108c61e211dSHarvey Harrison 
109c61e211dSHarvey Harrison 	while (scan_more && instr < max_instr) {
110c61e211dSHarvey Harrison 		unsigned char opcode;
111c61e211dSHarvey Harrison 		unsigned char instr_hi;
112c61e211dSHarvey Harrison 		unsigned char instr_lo;
113c61e211dSHarvey Harrison 
114c61e211dSHarvey Harrison 		if (probe_kernel_address(instr, opcode))
115c61e211dSHarvey Harrison 			break;
116c61e211dSHarvey Harrison 
117c61e211dSHarvey Harrison 		instr_hi = opcode & 0xf0;
118c61e211dSHarvey Harrison 		instr_lo = opcode & 0x0f;
119c61e211dSHarvey Harrison 		instr++;
120c61e211dSHarvey Harrison 
121c61e211dSHarvey Harrison 		switch (instr_hi) {
122c61e211dSHarvey Harrison 		case 0x20:
123c61e211dSHarvey Harrison 		case 0x30:
124c61e211dSHarvey Harrison 			/*
125c61e211dSHarvey Harrison 			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
126c61e211dSHarvey Harrison 			 * In X86_64 long mode, the CPU will signal invalid
127c61e211dSHarvey Harrison 			 * opcode if some of these prefixes are present so
128c61e211dSHarvey Harrison 			 * X86_64 will never get here anyway
129c61e211dSHarvey Harrison 			 */
130c61e211dSHarvey Harrison 			scan_more = ((instr_lo & 7) == 0x6);
131c61e211dSHarvey Harrison 			break;
132c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
133c61e211dSHarvey Harrison 		case 0x40:
134c61e211dSHarvey Harrison 			/*
135c61e211dSHarvey Harrison 			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
136c61e211dSHarvey Harrison 			 * Need to figure out under what instruction mode the
137c61e211dSHarvey Harrison 			 * instruction was issued. Could check the LDT for lm,
138c61e211dSHarvey Harrison 			 * but for now it's good enough to assume that long
139c61e211dSHarvey Harrison 			 * mode only uses well known segments or kernel.
140c61e211dSHarvey Harrison 			 */
141c61e211dSHarvey Harrison 			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
142c61e211dSHarvey Harrison 			break;
143c61e211dSHarvey Harrison #endif
144c61e211dSHarvey Harrison 		case 0x60:
145c61e211dSHarvey Harrison 			/* 0x64 thru 0x67 are valid prefixes in all modes. */
146c61e211dSHarvey Harrison 			scan_more = (instr_lo & 0xC) == 0x4;
147c61e211dSHarvey Harrison 			break;
148c61e211dSHarvey Harrison 		case 0xF0:
149c61e211dSHarvey Harrison 			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
150c61e211dSHarvey Harrison 			scan_more = !instr_lo || (instr_lo>>1) == 1;
151c61e211dSHarvey Harrison 			break;
152c61e211dSHarvey Harrison 		case 0x00:
153c61e211dSHarvey Harrison 			/* Prefetch instruction is 0x0F0D or 0x0F18 */
154c61e211dSHarvey Harrison 			scan_more = 0;
155c61e211dSHarvey Harrison 
156c61e211dSHarvey Harrison 			if (probe_kernel_address(instr, opcode))
157c61e211dSHarvey Harrison 				break;
158c61e211dSHarvey Harrison 			prefetch = (instr_lo == 0xF) &&
159c61e211dSHarvey Harrison 				(opcode == 0x0D || opcode == 0x18);
160c61e211dSHarvey Harrison 			break;
161c61e211dSHarvey Harrison 		default:
162c61e211dSHarvey Harrison 			scan_more = 0;
163c61e211dSHarvey Harrison 			break;
164c61e211dSHarvey Harrison 		}
165c61e211dSHarvey Harrison 	}
166c61e211dSHarvey Harrison 	return prefetch;
167c61e211dSHarvey Harrison }
168c61e211dSHarvey Harrison 
/*
 * Deliver si_signo (with si_code and the faulting address in si_addr)
 * to @tsk.  si_errno is always 0 for fault-generated signals.
 */
169c61e211dSHarvey Harrison static void force_sig_info_fault(int si_signo, int si_code,
170c61e211dSHarvey Harrison 	unsigned long address, struct task_struct *tsk)
171c61e211dSHarvey Harrison {
172c61e211dSHarvey Harrison 	siginfo_t info;
173c61e211dSHarvey Harrison 
174c61e211dSHarvey Harrison 	info.si_signo = si_signo;
175c61e211dSHarvey Harrison 	info.si_errno = 0;
176c61e211dSHarvey Harrison 	info.si_code = si_code;
177c61e211dSHarvey Harrison 	info.si_addr = (void __user *)address;
178c61e211dSHarvey Harrison 	force_sig_info(si_signo, &info, tsk);
179c61e211dSHarvey Harrison }
180c61e211dSHarvey Harrison 
181c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
/*
 * Non-zero when the page-table entry at @p cannot even be read safely;
 * used by dump_pagetable() when walking possibly-corrupt tables.
 */
182c61e211dSHarvey Harrison static int bad_address(void *p)
183c61e211dSHarvey Harrison {
184c61e211dSHarvey Harrison 	unsigned long dummy;
185c61e211dSHarvey Harrison 	return probe_kernel_address((unsigned long *)p, dummy);
186c61e211dSHarvey Harrison }
187c61e211dSHarvey Harrison #endif
188c61e211dSHarvey Harrison 
/*
 * Print the page-table entries mapping @address, walking the live
 * page tables (from CR3), for oops/debug output.  The 32-bit arm
 * reuses one scalar for each level; the 64-bit arm walks
 * pgd/pud/pmd/pte pointers and prints "BAD" on an unreadable entry.
 */
189c61e211dSHarvey Harrison void dump_pagetable(unsigned long address)
190c61e211dSHarvey Harrison {
191c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
192c61e211dSHarvey Harrison 	__typeof__(pte_val(__pte(0))) page;
193c61e211dSHarvey Harrison 
194c61e211dSHarvey Harrison 	page = read_cr3();
195c61e211dSHarvey Harrison 	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
196c61e211dSHarvey Harrison #ifdef CONFIG_X86_PAE
197c61e211dSHarvey Harrison 	printk("*pdpt = %016Lx ", page);
198c61e211dSHarvey Harrison 	if ((page >> PAGE_SHIFT) < max_low_pfn
199c61e211dSHarvey Harrison 	    && page & _PAGE_PRESENT) {
200c61e211dSHarvey Harrison 		page &= PAGE_MASK;
201c61e211dSHarvey Harrison 		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
202c61e211dSHarvey Harrison 		                                         & (PTRS_PER_PMD - 1)];
203c61e211dSHarvey Harrison 		printk(KERN_CONT "*pde = %016Lx ", page);
			/* Strip NX so the pfn test below sees a clean frame number. */
204c61e211dSHarvey Harrison 		page &= ~_PAGE_NX;
205c61e211dSHarvey Harrison 	}
206c61e211dSHarvey Harrison #else
207c61e211dSHarvey Harrison 	printk("*pde = %08lx ", page);
208c61e211dSHarvey Harrison #endif
209c61e211dSHarvey Harrison 
210c61e211dSHarvey Harrison 	/*
211c61e211dSHarvey Harrison 	 * We must not directly access the pte in the highpte
212c61e211dSHarvey Harrison 	 * case if the page table is located in highmem.
213c61e211dSHarvey Harrison 	 * And let's rather not kmap-atomic the pte, just in case
214c61e211dSHarvey Harrison 	 * it's allocated already.
215c61e211dSHarvey Harrison 	 */
216c61e211dSHarvey Harrison 	if ((page >> PAGE_SHIFT) < max_low_pfn
217c61e211dSHarvey Harrison 	    && (page & _PAGE_PRESENT)
218c61e211dSHarvey Harrison 	    && !(page & _PAGE_PSE)) {
219c61e211dSHarvey Harrison 		page &= PAGE_MASK;
220c61e211dSHarvey Harrison 		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
221c61e211dSHarvey Harrison 		                                         & (PTRS_PER_PTE - 1)];
222c61e211dSHarvey Harrison 		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
223c61e211dSHarvey Harrison 	}
224c61e211dSHarvey Harrison 
225c61e211dSHarvey Harrison 	printk("\n");
226c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
227c61e211dSHarvey Harrison 	pgd_t *pgd;
228c61e211dSHarvey Harrison 	pud_t *pud;
229c61e211dSHarvey Harrison 	pmd_t *pmd;
230c61e211dSHarvey Harrison 	pte_t *pte;
231c61e211dSHarvey Harrison 
232c61e211dSHarvey Harrison 	pgd = (pgd_t *)read_cr3();
233c61e211dSHarvey Harrison 
234c61e211dSHarvey Harrison 	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
235c61e211dSHarvey Harrison 	pgd += pgd_index(address);
236c61e211dSHarvey Harrison 	if (bad_address(pgd)) goto bad;
237c61e211dSHarvey Harrison 	printk("PGD %lx ", pgd_val(*pgd));
238c61e211dSHarvey Harrison 	if (!pgd_present(*pgd)) goto ret;
239c61e211dSHarvey Harrison 
240c61e211dSHarvey Harrison 	pud = pud_offset(pgd, address);
241c61e211dSHarvey Harrison 	if (bad_address(pud)) goto bad;
242c61e211dSHarvey Harrison 	printk("PUD %lx ", pud_val(*pud));
243c61e211dSHarvey Harrison 	if (!pud_present(*pud))	goto ret;
244c61e211dSHarvey Harrison 
245c61e211dSHarvey Harrison 	pmd = pmd_offset(pud, address);
246c61e211dSHarvey Harrison 	if (bad_address(pmd)) goto bad;
247c61e211dSHarvey Harrison 	printk("PMD %lx ", pmd_val(*pmd));
	/* A large pmd has no pte level to descend into. */
248c61e211dSHarvey Harrison 	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
249c61e211dSHarvey Harrison 
250c61e211dSHarvey Harrison 	pte = pte_offset_kernel(pmd, address);
251c61e211dSHarvey Harrison 	if (bad_address(pte)) goto bad;
252c61e211dSHarvey Harrison 	printk("PTE %lx", pte_val(*pte));
253c61e211dSHarvey Harrison ret:
254c61e211dSHarvey Harrison 	printk("\n");
255c61e211dSHarvey Harrison 	return;
256c61e211dSHarvey Harrison bad:
257c61e211dSHarvey Harrison 	printk("BAD\n");
258c61e211dSHarvey Harrison #endif
259c61e211dSHarvey Harrison }
260c61e211dSHarvey Harrison 
261c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
/*
 * Make @pgd's entry covering the kernel-space @address match the
 * reference page table (init_mm).  Returns init_mm's pmd entry for
 * @address, or NULL when init_mm has no mapping there.
 */
262c61e211dSHarvey Harrison static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
263c61e211dSHarvey Harrison {
264c61e211dSHarvey Harrison 	unsigned index = pgd_index(address);
265c61e211dSHarvey Harrison 	pgd_t *pgd_k;
266c61e211dSHarvey Harrison 	pud_t *pud, *pud_k;
267c61e211dSHarvey Harrison 	pmd_t *pmd, *pmd_k;
268c61e211dSHarvey Harrison 
269c61e211dSHarvey Harrison 	pgd += index;
270c61e211dSHarvey Harrison 	pgd_k = init_mm.pgd + index;
271c61e211dSHarvey Harrison 
272c61e211dSHarvey Harrison 	if (!pgd_present(*pgd_k))
273c61e211dSHarvey Harrison 		return NULL;
274c61e211dSHarvey Harrison 
275c61e211dSHarvey Harrison 	/*
276c61e211dSHarvey Harrison 	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
277c61e211dSHarvey Harrison 	 * and redundant with the set_pmd() on non-PAE. As would
278c61e211dSHarvey Harrison 	 * set_pud.
279c61e211dSHarvey Harrison 	 */
280c61e211dSHarvey Harrison 
281c61e211dSHarvey Harrison 	pud = pud_offset(pgd, address);
282c61e211dSHarvey Harrison 	pud_k = pud_offset(pgd_k, address);
283c61e211dSHarvey Harrison 	if (!pud_present(*pud_k))
284c61e211dSHarvey Harrison 		return NULL;
285c61e211dSHarvey Harrison 
286c61e211dSHarvey Harrison 	pmd = pmd_offset(pud, address);
287c61e211dSHarvey Harrison 	pmd_k = pmd_offset(pud_k, address);
288c61e211dSHarvey Harrison 	if (!pmd_present(*pmd_k))
289c61e211dSHarvey Harrison 		return NULL;
	/* Copy the missing entry; otherwise both must already agree. */
290c61e211dSHarvey Harrison 	if (!pmd_present(*pmd)) {
291c61e211dSHarvey Harrison 		set_pmd(pmd, *pmd_k);
292c61e211dSHarvey Harrison 		arch_flush_lazy_mmu_mode();
293c61e211dSHarvey Harrison 	} else
294c61e211dSHarvey Harrison 		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
295c61e211dSHarvey Harrison 	return pmd_k;
296c61e211dSHarvey Harrison }
297c61e211dSHarvey Harrison #endif
298c61e211dSHarvey Harrison 
299c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
/* Printed (once) by is_errata93() when the erratum #93 workaround fires. */
300c61e211dSHarvey Harrison static const char errata93_warning[] =
301c61e211dSHarvey Harrison KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
302c61e211dSHarvey Harrison KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
303c61e211dSHarvey Harrison KERN_ERR "******* Please consider a BIOS update.\n"
304c61e211dSHarvey Harrison KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
305c61e211dSHarvey Harrison #endif
306c61e211dSHarvey Harrison 
307c61e211dSHarvey Harrison /* Workaround for K8 erratum #93 & buggy BIOS.
308c61e211dSHarvey Harrison    BIOS SMM functions are required to use a specific workaround
309c61e211dSHarvey Harrison    to avoid corruption of the 64bit RIP register on C stepping K8.
310c61e211dSHarvey Harrison    A lot of BIOS that didn't get tested properly miss this.
311c61e211dSHarvey Harrison    The OS sees this as a page fault with the upper 32bits of RIP cleared.
312c61e211dSHarvey Harrison    Try to work around it here.
313c61e211dSHarvey Harrison    Note we only handle faults in kernel here.
314c61e211dSHarvey Harrison    Does nothing for X86_32
315c61e211dSHarvey Harrison  */
/* Returns 1 (and patches regs->ip) when the fault was the erratum. */
316c61e211dSHarvey Harrison static int is_errata93(struct pt_regs *regs, unsigned long address)
317c61e211dSHarvey Harrison {
318c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
319c61e211dSHarvey Harrison 	static int warned;
320c61e211dSHarvey Harrison 	if (address != regs->ip)
321c61e211dSHarvey Harrison 		return 0;
322c61e211dSHarvey Harrison 	if ((address >> 32) != 0)
323c61e211dSHarvey Harrison 		return 0;
	/* Restore the upper 32 bits the erratum cleared and retry there. */
324c61e211dSHarvey Harrison 	address |= 0xffffffffUL << 32;
325c61e211dSHarvey Harrison 	if ((address >= (u64)_stext && address <= (u64)_etext) ||
326c61e211dSHarvey Harrison 	    (address >= MODULES_VADDR && address <= MODULES_END)) {
327c61e211dSHarvey Harrison 		if (!warned) {
328c61e211dSHarvey Harrison 			printk(errata93_warning);
329c61e211dSHarvey Harrison 			warned = 1;
330c61e211dSHarvey Harrison 		}
331c61e211dSHarvey Harrison 		regs->ip = address;
332c61e211dSHarvey Harrison 		return 1;
333c61e211dSHarvey Harrison 	}
334c61e211dSHarvey Harrison #endif
335c61e211dSHarvey Harrison 	return 0;
336c61e211dSHarvey Harrison }
337c61e211dSHarvey Harrison 
338c61e211dSHarvey Harrison /*
339c61e211dSHarvey Harrison  * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
340c61e211dSHarvey Harrison  * addresses >4GB.  We catch this in the page fault handler because these
341c61e211dSHarvey Harrison  * addresses are not reachable. Just detect this case and return.  Any code
342c61e211dSHarvey Harrison  * segment in LDT is compatibility mode.
343c61e211dSHarvey Harrison  */
344c61e211dSHarvey Harrison static int is_errata100(struct pt_regs *regs, unsigned long address)
345c61e211dSHarvey Harrison {
346c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
	/* CS selector TI bit (bit 2) set => segment loaded from the LDT. */
347c61e211dSHarvey Harrison 	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
348c61e211dSHarvey Harrison 	    (address >> 32))
349c61e211dSHarvey Harrison 		return 1;
350c61e211dSHarvey Harrison #endif
351c61e211dSHarvey Harrison 	return 0;
352c61e211dSHarvey Harrison }
353c61e211dSHarvey Harrison 
354c61e211dSHarvey Harrison void do_invalid_op(struct pt_regs *, unsigned long);
355c61e211dSHarvey Harrison 
/*
 * Pentium F00F erratum workaround: if the fault address falls on IDT
 * descriptor 6 (invalid opcode), route the fault to do_invalid_op()
 * and return 1; otherwise return 0.
 */
356c61e211dSHarvey Harrison static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
357c61e211dSHarvey Harrison {
358c61e211dSHarvey Harrison #ifdef CONFIG_X86_F00F_BUG
359c61e211dSHarvey Harrison 	unsigned long nr;
360c61e211dSHarvey Harrison 	/*
361c61e211dSHarvey Harrison 	 * Pentium F0 0F C7 C8 bug workaround.
362c61e211dSHarvey Harrison 	 */
363c61e211dSHarvey Harrison 	if (boot_cpu_data.f00f_bug) {
		/* Index of the 8-byte IDT descriptor at the fault address. */
364c61e211dSHarvey Harrison 		nr = (address - idt_descr.address) >> 3;
365c61e211dSHarvey Harrison 
366c61e211dSHarvey Harrison 		if (nr == 6) {
367c61e211dSHarvey Harrison 			do_invalid_op(regs, 0);
368c61e211dSHarvey Harrison 			return 1;
369c61e211dSHarvey Harrison 		}
370c61e211dSHarvey Harrison 	}
371c61e211dSHarvey Harrison #endif
372c61e211dSHarvey Harrison 	return 0;
373c61e211dSHarvey Harrison }
374c61e211dSHarvey Harrison 
/*
 * Print the oops banner ("BUG: unable to handle kernel ...") with the
 * faulting address, the IP, and a page-table dump.
 * NOTE(review): oops_may_print() gates output only in the X86_32 arm;
 * the X86_64 arm prints unconditionally -- confirm whether intended.
 */
375c61e211dSHarvey Harrison static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
376c61e211dSHarvey Harrison 			    unsigned long address)
377c61e211dSHarvey Harrison {
378c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
379c61e211dSHarvey Harrison 	if (!oops_may_print())
380c61e211dSHarvey Harrison 		return;
381c61e211dSHarvey Harrison 
	/* Instruction fetch from a present-but-NX page: likely an exploit. */
382c61e211dSHarvey Harrison #ifdef CONFIG_X86_PAE
383c61e211dSHarvey Harrison 	if (error_code & PF_INSTR) {
384c61e211dSHarvey Harrison 		int level;
385c61e211dSHarvey Harrison 		pte_t *pte = lookup_address(address, &level);
386c61e211dSHarvey Harrison 
387c61e211dSHarvey Harrison 		if (pte && pte_present(*pte) && !pte_exec(*pte))
388c61e211dSHarvey Harrison 			printk(KERN_CRIT "kernel tried to execute "
389c61e211dSHarvey Harrison 				"NX-protected page - exploit attempt? "
390c61e211dSHarvey Harrison 				"(uid: %d)\n", current->uid);
391c61e211dSHarvey Harrison 	}
392c61e211dSHarvey Harrison #endif
393c61e211dSHarvey Harrison 	printk(KERN_ALERT "BUG: unable to handle kernel ");
394c61e211dSHarvey Harrison 	if (address < PAGE_SIZE)
395c61e211dSHarvey Harrison 		printk(KERN_CONT "NULL pointer dereference");
396c61e211dSHarvey Harrison 	else
397c61e211dSHarvey Harrison 		printk(KERN_CONT "paging request");
398c61e211dSHarvey Harrison 	printk(KERN_CONT " at %08lx\n", address);
399c61e211dSHarvey Harrison 
400c61e211dSHarvey Harrison 	printk(KERN_ALERT "IP:");
401c61e211dSHarvey Harrison 	printk_address(regs->ip, 1);
402c61e211dSHarvey Harrison 	dump_pagetable(address);
403c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
404c61e211dSHarvey Harrison 	printk(KERN_ALERT "BUG: unable to handle kernel ");
405c61e211dSHarvey Harrison 	if (address < PAGE_SIZE)
406c61e211dSHarvey Harrison 		printk(KERN_CONT "NULL pointer dereference");
407c61e211dSHarvey Harrison 	else
408c61e211dSHarvey Harrison 		printk(KERN_CONT "paging request");
409c61e211dSHarvey Harrison 	printk(KERN_CONT " at %016lx\n", address);
410c61e211dSHarvey Harrison 
411c61e211dSHarvey Harrison 	printk(KERN_ALERT "IP:");
412c61e211dSHarvey Harrison 	printk_address(regs->ip, 1);
413c61e211dSHarvey Harrison 	dump_pagetable(address);
414c61e211dSHarvey Harrison #endif
415c61e211dSHarvey Harrison }
416c61e211dSHarvey Harrison 
417c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
/*
 * Oops on a corrupted kernel page table: report it, record the fault
 * details in the task's thread struct, then die with SIGKILL.
 */
418c61e211dSHarvey Harrison static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
419c61e211dSHarvey Harrison 				 unsigned long error_code)
420c61e211dSHarvey Harrison {
421c61e211dSHarvey Harrison 	unsigned long flags = oops_begin();
422c61e211dSHarvey Harrison 	struct task_struct *tsk;
423c61e211dSHarvey Harrison 
424c61e211dSHarvey Harrison 	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
425c61e211dSHarvey Harrison 	       current->comm, address);
426c61e211dSHarvey Harrison 	dump_pagetable(address);
427c61e211dSHarvey Harrison 	tsk = current;
428c61e211dSHarvey Harrison 	tsk->thread.cr2 = address;
429c61e211dSHarvey Harrison 	tsk->thread.trap_no = 14;
430c61e211dSHarvey Harrison 	tsk->thread.error_code = error_code;
	/* __die() non-zero means the oops was already reported; drop regs. */
431c61e211dSHarvey Harrison 	if (__die("Bad pagetable", regs, error_code))
432c61e211dSHarvey Harrison 		regs = NULL;
433c61e211dSHarvey Harrison 	oops_end(flags, regs, SIGKILL);
434c61e211dSHarvey Harrison }
435c61e211dSHarvey Harrison #endif
436c61e211dSHarvey Harrison 
437c61e211dSHarvey Harrison /*
438*5b727a3bSJeremy Fitzhardinge  * Handle a spurious fault caused by a stale TLB entry.  This allows
439*5b727a3bSJeremy Fitzhardinge  * us to lazily refresh the TLB when increasing the permissions of a
440*5b727a3bSJeremy Fitzhardinge  * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
441*5b727a3bSJeremy Fitzhardinge  * expensive since that implies doing a full cross-processor TLB
442*5b727a3bSJeremy Fitzhardinge  * flush, even if no stale TLB entries exist on other processors.
443*5b727a3bSJeremy Fitzhardinge  * There are no security implications to leaving a stale TLB when
444*5b727a3bSJeremy Fitzhardinge  * increasing the permissions on a page.
445*5b727a3bSJeremy Fitzhardinge  */
446*5b727a3bSJeremy Fitzhardinge static int spurious_fault(unsigned long address,
447*5b727a3bSJeremy Fitzhardinge 			  unsigned long error_code)
448*5b727a3bSJeremy Fitzhardinge {
449*5b727a3bSJeremy Fitzhardinge 	pgd_t *pgd;
450*5b727a3bSJeremy Fitzhardinge 	pud_t *pud;
451*5b727a3bSJeremy Fitzhardinge 	pmd_t *pmd;
452*5b727a3bSJeremy Fitzhardinge 	pte_t *pte;
453*5b727a3bSJeremy Fitzhardinge 
454*5b727a3bSJeremy Fitzhardinge 	/* Reserved-bit violation or user access to kernel space? */
455*5b727a3bSJeremy Fitzhardinge 	if (error_code & (PF_USER | PF_RSVD))
456*5b727a3bSJeremy Fitzhardinge 		return 0;
457*5b727a3bSJeremy Fitzhardinge 
458*5b727a3bSJeremy Fitzhardinge 	pgd = init_mm.pgd + pgd_index(address);
459*5b727a3bSJeremy Fitzhardinge 	if (!pgd_present(*pgd))
460*5b727a3bSJeremy Fitzhardinge 		return 0;
461*5b727a3bSJeremy Fitzhardinge 
462*5b727a3bSJeremy Fitzhardinge 	pud = pud_offset(pgd, address);
463*5b727a3bSJeremy Fitzhardinge 	if (!pud_present(*pud))
464*5b727a3bSJeremy Fitzhardinge 		return 0;
465*5b727a3bSJeremy Fitzhardinge 
466*5b727a3bSJeremy Fitzhardinge 	pmd = pmd_offset(pud, address);
467*5b727a3bSJeremy Fitzhardinge 	if (!pmd_present(*pmd))
468*5b727a3bSJeremy Fitzhardinge 		return 0;
469*5b727a3bSJeremy Fitzhardinge 
470*5b727a3bSJeremy Fitzhardinge 	pte = pte_offset_kernel(pmd, address);
471*5b727a3bSJeremy Fitzhardinge 	if (!pte_present(*pte))
472*5b727a3bSJeremy Fitzhardinge 		return 0;
473*5b727a3bSJeremy Fitzhardinge 
474*5b727a3bSJeremy Fitzhardinge 	if ((error_code & PF_WRITE) && !pte_write(*pte))
475*5b727a3bSJeremy Fitzhardinge 		return 0;
476*5b727a3bSJeremy Fitzhardinge 	if ((error_code & PF_INSTR) && !pte_exec(*pte))
477*5b727a3bSJeremy Fitzhardinge 		return 0;
478*5b727a3bSJeremy Fitzhardinge 
479*5b727a3bSJeremy Fitzhardinge 	return 1;
480*5b727a3bSJeremy Fitzhardinge }
481*5b727a3bSJeremy Fitzhardinge 
482*5b727a3bSJeremy Fitzhardinge /*
483c61e211dSHarvey Harrison  * X86_32
484c61e211dSHarvey Harrison  * Handle a fault on the vmalloc or module mapping area
485c61e211dSHarvey Harrison  *
486c61e211dSHarvey Harrison  * X86_64
487c61e211dSHarvey Harrison  * Handle a fault on the vmalloc area
488c61e211dSHarvey Harrison  *
489c61e211dSHarvey Harrison  * This assumes no large pages in there.
490c61e211dSHarvey Harrison  */
/*
 * Returns 0 if the fault was resolved by syncing against the
 * reference page tables (init_mm), -1 if the address is not mapped
 * there either.
 */
491c61e211dSHarvey Harrison static int vmalloc_fault(unsigned long address)
492c61e211dSHarvey Harrison {
493c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
494c61e211dSHarvey Harrison 	unsigned long pgd_paddr;
495c61e211dSHarvey Harrison 	pmd_t *pmd_k;
496c61e211dSHarvey Harrison 	pte_t *pte_k;
497c61e211dSHarvey Harrison 	/*
498c61e211dSHarvey Harrison 	 * Synchronize this task's top level page-table
499c61e211dSHarvey Harrison 	 * with the 'reference' page table.
500c61e211dSHarvey Harrison 	 *
501c61e211dSHarvey Harrison 	 * Do _not_ use "current" here. We might be inside
502c61e211dSHarvey Harrison 	 * an interrupt in the middle of a task switch..
503c61e211dSHarvey Harrison 	 */
504c61e211dSHarvey Harrison 	pgd_paddr = read_cr3();
505c61e211dSHarvey Harrison 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
506c61e211dSHarvey Harrison 	if (!pmd_k)
507c61e211dSHarvey Harrison 		return -1;
508c61e211dSHarvey Harrison 	pte_k = pte_offset_kernel(pmd_k, address);
509c61e211dSHarvey Harrison 	if (!pte_present(*pte_k))
510c61e211dSHarvey Harrison 		return -1;
511c61e211dSHarvey Harrison 	return 0;
512c61e211dSHarvey Harrison #else
513c61e211dSHarvey Harrison 	pgd_t *pgd, *pgd_ref;
514c61e211dSHarvey Harrison 	pud_t *pud, *pud_ref;
515c61e211dSHarvey Harrison 	pmd_t *pmd, *pmd_ref;
516c61e211dSHarvey Harrison 	pte_t *pte, *pte_ref;
517c61e211dSHarvey Harrison 
518c61e211dSHarvey Harrison 	/* Copy kernel mappings over when needed. This can also
519c61e211dSHarvey Harrison 	   happen within a race in page table update. In the later
520c61e211dSHarvey Harrison 	   case just flush. */
521c61e211dSHarvey Harrison 
	/* Fall back to init_mm when there is no user mm (kernel thread). */
522c61e211dSHarvey Harrison 	pgd = pgd_offset(current->mm ?: &init_mm, address);
523c61e211dSHarvey Harrison 	pgd_ref = pgd_offset_k(address);
524c61e211dSHarvey Harrison 	if (pgd_none(*pgd_ref))
525c61e211dSHarvey Harrison 		return -1;
526c61e211dSHarvey Harrison 	if (pgd_none(*pgd))
527c61e211dSHarvey Harrison 		set_pgd(pgd, *pgd_ref);
528c61e211dSHarvey Harrison 	else
529c61e211dSHarvey Harrison 		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
530c61e211dSHarvey Harrison 
531c61e211dSHarvey Harrison 	/* Below here mismatches are bugs because these lower tables
532c61e211dSHarvey Harrison 	   are shared */
533c61e211dSHarvey Harrison 
534c61e211dSHarvey Harrison 	pud = pud_offset(pgd, address);
535c61e211dSHarvey Harrison 	pud_ref = pud_offset(pgd_ref, address);
536c61e211dSHarvey Harrison 	if (pud_none(*pud_ref))
537c61e211dSHarvey Harrison 		return -1;
538c61e211dSHarvey Harrison 	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
539c61e211dSHarvey Harrison 		BUG();
540c61e211dSHarvey Harrison 	pmd = pmd_offset(pud, address);
541c61e211dSHarvey Harrison 	pmd_ref = pmd_offset(pud_ref, address);
542c61e211dSHarvey Harrison 	if (pmd_none(*pmd_ref))
543c61e211dSHarvey Harrison 		return -1;
544c61e211dSHarvey Harrison 	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
545c61e211dSHarvey Harrison 		BUG();
546c61e211dSHarvey Harrison 	pte_ref = pte_offset_kernel(pmd_ref, address);
547c61e211dSHarvey Harrison 	if (!pte_present(*pte_ref))
548c61e211dSHarvey Harrison 		return -1;
549c61e211dSHarvey Harrison 	pte = pte_offset_kernel(pmd, address);
550c61e211dSHarvey Harrison 	/* Don't use pte_page here, because the mappings can point
551c61e211dSHarvey Harrison 	   outside mem_map, and the NUMA hash lookup cannot handle
552c61e211dSHarvey Harrison 	   that. */
553c61e211dSHarvey Harrison 	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
554c61e211dSHarvey Harrison 		BUG();
555c61e211dSHarvey Harrison 	return 0;
556c61e211dSHarvey Harrison #endif
557c61e211dSHarvey Harrison }
558c61e211dSHarvey Harrison 
/*
 * NOTE(review): presumably toggles reporting of unhandled user-space
 * faults; its consumers are not visible in this chunk -- confirm.
 */
559c61e211dSHarvey Harrison int show_unhandled_signals = 1;
560c61e211dSHarvey Harrison 
561c61e211dSHarvey Harrison /*
562c61e211dSHarvey Harrison  * This routine handles page faults.  It determines the address,
563c61e211dSHarvey Harrison  * and the problem, and then passes it off to one of the appropriate
564c61e211dSHarvey Harrison  * routines.
565c61e211dSHarvey Harrison  */
566c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
567c61e211dSHarvey Harrison asmlinkage
568c61e211dSHarvey Harrison #endif
569c61e211dSHarvey Harrison void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
570c61e211dSHarvey Harrison {
571c61e211dSHarvey Harrison 	struct task_struct *tsk;
572c61e211dSHarvey Harrison 	struct mm_struct *mm;
573c61e211dSHarvey Harrison 	struct vm_area_struct *vma;
574c61e211dSHarvey Harrison 	unsigned long address;
575c61e211dSHarvey Harrison 	int write, si_code;
576c61e211dSHarvey Harrison 	int fault;
577c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
578c61e211dSHarvey Harrison 	unsigned long flags;
579c61e211dSHarvey Harrison #endif
580c61e211dSHarvey Harrison 
581c61e211dSHarvey Harrison 	/*
582c61e211dSHarvey Harrison 	 * We can fault from pretty much anywhere, with unknown IRQ state.
583c61e211dSHarvey Harrison 	 */
584c61e211dSHarvey Harrison 	trace_hardirqs_fixup();
585c61e211dSHarvey Harrison 
586c61e211dSHarvey Harrison 	tsk = current;
587c61e211dSHarvey Harrison 	mm = tsk->mm;
588c61e211dSHarvey Harrison 	prefetchw(&mm->mmap_sem);
589c61e211dSHarvey Harrison 
590c61e211dSHarvey Harrison 	/* get the address */
591c61e211dSHarvey Harrison 	address = read_cr2();
592c61e211dSHarvey Harrison 
593c61e211dSHarvey Harrison 	si_code = SEGV_MAPERR;
594c61e211dSHarvey Harrison 
595c61e211dSHarvey Harrison 	if (notify_page_fault(regs))
596c61e211dSHarvey Harrison 		return;
597c61e211dSHarvey Harrison 
598c61e211dSHarvey Harrison 	/*
599c61e211dSHarvey Harrison 	 * We fault-in kernel-space virtual memory on-demand. The
600c61e211dSHarvey Harrison 	 * 'reference' page table is init_mm.pgd.
601c61e211dSHarvey Harrison 	 *
602c61e211dSHarvey Harrison 	 * NOTE! We MUST NOT take any locks for this case. We may
603c61e211dSHarvey Harrison 	 * be in an interrupt or a critical region, and should
604c61e211dSHarvey Harrison 	 * only copy the information from the master page table,
605c61e211dSHarvey Harrison 	 * nothing more.
606c61e211dSHarvey Harrison 	 *
607c61e211dSHarvey Harrison 	 * This verifies that the fault happens in kernel space
608c61e211dSHarvey Harrison 	 * (error_code & 4) == 0, and that the fault was not a
609c61e211dSHarvey Harrison 	 * protection error (error_code & 9) == 0.
610c61e211dSHarvey Harrison 	 */
611c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
612c61e211dSHarvey Harrison 	if (unlikely(address >= TASK_SIZE)) {
613c61e211dSHarvey Harrison 		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
614c61e211dSHarvey Harrison 		    vmalloc_fault(address) >= 0)
615c61e211dSHarvey Harrison 			return;
616*5b727a3bSJeremy Fitzhardinge 
617*5b727a3bSJeremy Fitzhardinge 		/* Can handle a stale RO->RW TLB */
618*5b727a3bSJeremy Fitzhardinge 		if (spurious_fault(address, error_code))
619*5b727a3bSJeremy Fitzhardinge 			return;
620*5b727a3bSJeremy Fitzhardinge 
621c61e211dSHarvey Harrison 		/*
622c61e211dSHarvey Harrison 		 * Don't take the mm semaphore here. If we fixup a prefetch
623c61e211dSHarvey Harrison 		 * fault we could otherwise deadlock.
624c61e211dSHarvey Harrison 		 */
625c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
626c61e211dSHarvey Harrison 	}
627c61e211dSHarvey Harrison 
628c61e211dSHarvey Harrison 	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
629c61e211dSHarvey Harrison 	   fault has been handled. */
630c61e211dSHarvey Harrison 	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
631c61e211dSHarvey Harrison 		local_irq_enable();
632c61e211dSHarvey Harrison 
633c61e211dSHarvey Harrison 	/*
634c61e211dSHarvey Harrison 	 * If we're in an interrupt, have no user context or are running in an
635c61e211dSHarvey Harrison 	 * atomic region then we must not take the fault.
636c61e211dSHarvey Harrison 	 */
637c61e211dSHarvey Harrison 	if (in_atomic() || !mm)
638c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
639c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
640c61e211dSHarvey Harrison 	if (unlikely(address >= TASK_SIZE64)) {
641c61e211dSHarvey Harrison 		/*
642c61e211dSHarvey Harrison 		 * Don't check for the module range here: its PML4
643c61e211dSHarvey Harrison 		 * is always initialized because it's shared with the main
644c61e211dSHarvey Harrison 		 * kernel text. Only vmalloc may need PML4 syncups.
645c61e211dSHarvey Harrison 		 */
646c61e211dSHarvey Harrison 		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
647c61e211dSHarvey Harrison 		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
648c61e211dSHarvey Harrison 			if (vmalloc_fault(address) >= 0)
649c61e211dSHarvey Harrison 				return;
650c61e211dSHarvey Harrison 		}
651*5b727a3bSJeremy Fitzhardinge 
652*5b727a3bSJeremy Fitzhardinge 		/* Can handle a stale RO->RW TLB */
653*5b727a3bSJeremy Fitzhardinge 		if (spurious_fault(address, error_code))
654*5b727a3bSJeremy Fitzhardinge 			return;
655*5b727a3bSJeremy Fitzhardinge 
656c61e211dSHarvey Harrison 		/*
657c61e211dSHarvey Harrison 		 * Don't take the mm semaphore here. If we fixup a prefetch
658c61e211dSHarvey Harrison 		 * fault we could otherwise deadlock.
659c61e211dSHarvey Harrison 		 */
660c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
661c61e211dSHarvey Harrison 	}
662c61e211dSHarvey Harrison 	if (likely(regs->flags & X86_EFLAGS_IF))
663c61e211dSHarvey Harrison 		local_irq_enable();
664c61e211dSHarvey Harrison 
665c61e211dSHarvey Harrison 	if (unlikely(error_code & PF_RSVD))
666c61e211dSHarvey Harrison 		pgtable_bad(address, regs, error_code);
667c61e211dSHarvey Harrison 
668c61e211dSHarvey Harrison 	/*
669c61e211dSHarvey Harrison 	 * If we're in an interrupt, have no user context or are running in an
670c61e211dSHarvey Harrison 	 * atomic region then we must not take the fault.
671c61e211dSHarvey Harrison 	 */
672c61e211dSHarvey Harrison 	if (unlikely(in_atomic() || !mm))
673c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
674c61e211dSHarvey Harrison 
675c61e211dSHarvey Harrison 	/*
676c61e211dSHarvey Harrison 	 * User-mode registers count as a user access even for any
677c61e211dSHarvey Harrison 	 * potential system fault or CPU buglet.
678c61e211dSHarvey Harrison 	 */
679c61e211dSHarvey Harrison 	if (user_mode_vm(regs))
680c61e211dSHarvey Harrison 		error_code |= PF_USER;
681c61e211dSHarvey Harrison again:
682c61e211dSHarvey Harrison #endif
683c61e211dSHarvey Harrison 	/* When running in the kernel we expect faults to occur only to
684c61e211dSHarvey Harrison 	 * addresses in user space.  All other faults represent errors in the
685c61e211dSHarvey Harrison 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
686c61e211dSHarvey Harrison 	 * erroneous fault occurring in a code path which already holds mmap_sem
687c61e211dSHarvey Harrison 	 * we will deadlock attempting to validate the fault against the
688c61e211dSHarvey Harrison 	 * address space.  Luckily the kernel only validly references user
689c61e211dSHarvey Harrison 	 * space from well defined areas of code, which are listed in the
690c61e211dSHarvey Harrison 	 * exceptions table.
691c61e211dSHarvey Harrison 	 *
692c61e211dSHarvey Harrison 	 * As the vast majority of faults will be valid we will only perform
693c61e211dSHarvey Harrison 	 * the source reference check when there is a possibility of a deadlock.
694c61e211dSHarvey Harrison 	 * Attempt to lock the address space, if we cannot we then validate the
695c61e211dSHarvey Harrison 	 * source.  If this is invalid we can skip the address space check,
696c61e211dSHarvey Harrison 	 * thus avoiding the deadlock.
697c61e211dSHarvey Harrison 	 */
698c61e211dSHarvey Harrison 	if (!down_read_trylock(&mm->mmap_sem)) {
699c61e211dSHarvey Harrison 		if ((error_code & PF_USER) == 0 &&
700c61e211dSHarvey Harrison 		    !search_exception_tables(regs->ip))
701c61e211dSHarvey Harrison 			goto bad_area_nosemaphore;
702c61e211dSHarvey Harrison 		down_read(&mm->mmap_sem);
703c61e211dSHarvey Harrison 	}
704c61e211dSHarvey Harrison 
705c61e211dSHarvey Harrison 	vma = find_vma(mm, address);
706c61e211dSHarvey Harrison 	if (!vma)
707c61e211dSHarvey Harrison 		goto bad_area;
708c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
709c61e211dSHarvey Harrison 	if (vma->vm_start <= address)
710c61e211dSHarvey Harrison #else
711c61e211dSHarvey Harrison 	if (likely(vma->vm_start <= address))
712c61e211dSHarvey Harrison #endif
713c61e211dSHarvey Harrison 		goto good_area;
714c61e211dSHarvey Harrison 	if (!(vma->vm_flags & VM_GROWSDOWN))
715c61e211dSHarvey Harrison 		goto bad_area;
716c61e211dSHarvey Harrison 	if (error_code & PF_USER) {
717c61e211dSHarvey Harrison 		/*
718c61e211dSHarvey Harrison 		 * Accessing the stack below %sp is always a bug.
719c61e211dSHarvey Harrison 		 * The large cushion allows instructions like enter
720c61e211dSHarvey Harrison 		 * and pusha to work.  ("enter $65535,$31" pushes
721c61e211dSHarvey Harrison 		 * 32 pointers and then decrements %sp by 65535.)
722c61e211dSHarvey Harrison 		 */
723c61e211dSHarvey Harrison 		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
724c61e211dSHarvey Harrison 			goto bad_area;
725c61e211dSHarvey Harrison 	}
726c61e211dSHarvey Harrison 	if (expand_stack(vma, address))
727c61e211dSHarvey Harrison 		goto bad_area;
728c61e211dSHarvey Harrison /*
729c61e211dSHarvey Harrison  * Ok, we have a good vm_area for this memory access, so
730c61e211dSHarvey Harrison  * we can handle it..
731c61e211dSHarvey Harrison  */
732c61e211dSHarvey Harrison good_area:
733c61e211dSHarvey Harrison 	si_code = SEGV_ACCERR;
734c61e211dSHarvey Harrison 	write = 0;
735c61e211dSHarvey Harrison 	switch (error_code & (PF_PROT|PF_WRITE)) {
736c61e211dSHarvey Harrison 	default:	/* 3: write, present */
737c61e211dSHarvey Harrison 		/* fall through */
738c61e211dSHarvey Harrison 	case PF_WRITE:		/* write, not present */
739c61e211dSHarvey Harrison 		if (!(vma->vm_flags & VM_WRITE))
740c61e211dSHarvey Harrison 			goto bad_area;
741c61e211dSHarvey Harrison 		write++;
742c61e211dSHarvey Harrison 		break;
743c61e211dSHarvey Harrison 	case PF_PROT:		/* read, present */
744c61e211dSHarvey Harrison 		goto bad_area;
745c61e211dSHarvey Harrison 	case 0:			/* read, not present */
746c61e211dSHarvey Harrison 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
747c61e211dSHarvey Harrison 			goto bad_area;
748c61e211dSHarvey Harrison 	}
749c61e211dSHarvey Harrison 
750c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
751c61e211dSHarvey Harrison survive:
752c61e211dSHarvey Harrison #endif
753c61e211dSHarvey Harrison 	/*
754c61e211dSHarvey Harrison 	 * If for any reason at all we couldn't handle the fault,
755c61e211dSHarvey Harrison 	 * make sure we exit gracefully rather than endlessly redo
756c61e211dSHarvey Harrison 	 * the fault.
757c61e211dSHarvey Harrison 	 */
758c61e211dSHarvey Harrison 	fault = handle_mm_fault(mm, vma, address, write);
759c61e211dSHarvey Harrison 	if (unlikely(fault & VM_FAULT_ERROR)) {
760c61e211dSHarvey Harrison 		if (fault & VM_FAULT_OOM)
761c61e211dSHarvey Harrison 			goto out_of_memory;
762c61e211dSHarvey Harrison 		else if (fault & VM_FAULT_SIGBUS)
763c61e211dSHarvey Harrison 			goto do_sigbus;
764c61e211dSHarvey Harrison 		BUG();
765c61e211dSHarvey Harrison 	}
766c61e211dSHarvey Harrison 	if (fault & VM_FAULT_MAJOR)
767c61e211dSHarvey Harrison 		tsk->maj_flt++;
768c61e211dSHarvey Harrison 	else
769c61e211dSHarvey Harrison 		tsk->min_flt++;
770c61e211dSHarvey Harrison 
771c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
772c61e211dSHarvey Harrison 	/*
773c61e211dSHarvey Harrison 	 * Did it hit the DOS screen memory VA from vm86 mode?
774c61e211dSHarvey Harrison 	 */
775c61e211dSHarvey Harrison 	if (v8086_mode(regs)) {
776c61e211dSHarvey Harrison 		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
777c61e211dSHarvey Harrison 		if (bit < 32)
778c61e211dSHarvey Harrison 			tsk->thread.screen_bitmap |= 1 << bit;
779c61e211dSHarvey Harrison 	}
780c61e211dSHarvey Harrison #endif
781c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
782c61e211dSHarvey Harrison 	return;
783c61e211dSHarvey Harrison 
784c61e211dSHarvey Harrison /*
785c61e211dSHarvey Harrison  * Something tried to access memory that isn't in our memory map..
786c61e211dSHarvey Harrison  * Fix it, but check if it's kernel or user first..
787c61e211dSHarvey Harrison  */
788c61e211dSHarvey Harrison bad_area:
789c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
790c61e211dSHarvey Harrison 
791c61e211dSHarvey Harrison bad_area_nosemaphore:
792c61e211dSHarvey Harrison 	/* User mode accesses just cause a SIGSEGV */
793c61e211dSHarvey Harrison 	if (error_code & PF_USER) {
794c61e211dSHarvey Harrison 		/*
795c61e211dSHarvey Harrison 		 * It's possible to have interrupts off here.
796c61e211dSHarvey Harrison 		 */
797c61e211dSHarvey Harrison 		local_irq_enable();
798c61e211dSHarvey Harrison 
799c61e211dSHarvey Harrison 		/*
800c61e211dSHarvey Harrison 		 * Valid to do another page fault here because this one came
801c61e211dSHarvey Harrison 		 * from user space.
802c61e211dSHarvey Harrison 		 */
803c61e211dSHarvey Harrison 		if (is_prefetch(regs, address, error_code))
804c61e211dSHarvey Harrison 			return;
805c61e211dSHarvey Harrison 
806c61e211dSHarvey Harrison 		if (is_errata100(regs, address))
807c61e211dSHarvey Harrison 			return;
808c61e211dSHarvey Harrison 
809c61e211dSHarvey Harrison 		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
810c61e211dSHarvey Harrison 		    printk_ratelimit()) {
811c61e211dSHarvey Harrison 			printk(
812c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
813c61e211dSHarvey Harrison 			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
814c61e211dSHarvey Harrison #else
815c61e211dSHarvey Harrison 			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
816c61e211dSHarvey Harrison #endif
817c61e211dSHarvey Harrison 			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
818c61e211dSHarvey Harrison 			tsk->comm, task_pid_nr(tsk), address, regs->ip,
819c61e211dSHarvey Harrison 			regs->sp, error_code);
820c61e211dSHarvey Harrison 			print_vma_addr(" in ", regs->ip);
821c61e211dSHarvey Harrison 			printk("\n");
822c61e211dSHarvey Harrison 		}
823c61e211dSHarvey Harrison 
824c61e211dSHarvey Harrison 		tsk->thread.cr2 = address;
825c61e211dSHarvey Harrison 		/* Kernel addresses are always protection faults */
826c61e211dSHarvey Harrison 		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
827c61e211dSHarvey Harrison 		tsk->thread.trap_no = 14;
828c61e211dSHarvey Harrison 		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
829c61e211dSHarvey Harrison 		return;
830c61e211dSHarvey Harrison 	}
831c61e211dSHarvey Harrison 
832c61e211dSHarvey Harrison 	if (is_f00f_bug(regs, address))
833c61e211dSHarvey Harrison 		return;
834c61e211dSHarvey Harrison 
835c61e211dSHarvey Harrison no_context:
836c61e211dSHarvey Harrison 	/* Are we prepared to handle this kernel fault?  */
837c61e211dSHarvey Harrison 	if (fixup_exception(regs))
838c61e211dSHarvey Harrison 		return;
839c61e211dSHarvey Harrison 
840c61e211dSHarvey Harrison 	/*
841c61e211dSHarvey Harrison 	 * X86_32
842c61e211dSHarvey Harrison 	 * Valid to do another page fault here, because if this fault
843c61e211dSHarvey Harrison 	 * had been triggered by is_prefetch fixup_exception would have
844c61e211dSHarvey Harrison 	 * handled it.
845c61e211dSHarvey Harrison 	 *
846c61e211dSHarvey Harrison 	 * X86_64
847c61e211dSHarvey Harrison 	 * Hall of shame of CPU/BIOS bugs.
848c61e211dSHarvey Harrison 	 */
849c61e211dSHarvey Harrison 	if (is_prefetch(regs, address, error_code))
850c61e211dSHarvey Harrison 		return;
851c61e211dSHarvey Harrison 
852c61e211dSHarvey Harrison 	if (is_errata93(regs, address))
853c61e211dSHarvey Harrison 		return;
854c61e211dSHarvey Harrison 
855c61e211dSHarvey Harrison /*
856c61e211dSHarvey Harrison  * Oops. The kernel tried to access some bad page. We'll have to
857c61e211dSHarvey Harrison  * terminate things with extreme prejudice.
858c61e211dSHarvey Harrison  */
859c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
860c61e211dSHarvey Harrison 	bust_spinlocks(1);
861c61e211dSHarvey Harrison 
862c61e211dSHarvey Harrison 	show_fault_oops(regs, error_code, address);
863c61e211dSHarvey Harrison 
864c61e211dSHarvey Harrison 	tsk->thread.cr2 = address;
865c61e211dSHarvey Harrison 	tsk->thread.trap_no = 14;
866c61e211dSHarvey Harrison 	tsk->thread.error_code = error_code;
867c61e211dSHarvey Harrison 	die("Oops", regs, error_code);
868c61e211dSHarvey Harrison 	bust_spinlocks(0);
869c61e211dSHarvey Harrison 	do_exit(SIGKILL);
870c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
871c61e211dSHarvey Harrison 	flags = oops_begin();
872c61e211dSHarvey Harrison 
873c61e211dSHarvey Harrison 	show_fault_oops(regs, error_code, address);
874c61e211dSHarvey Harrison 
875c61e211dSHarvey Harrison 	tsk->thread.cr2 = address;
876c61e211dSHarvey Harrison 	tsk->thread.trap_no = 14;
877c61e211dSHarvey Harrison 	tsk->thread.error_code = error_code;
878c61e211dSHarvey Harrison 	if (__die("Oops", regs, error_code))
879c61e211dSHarvey Harrison 		regs = NULL;
880c61e211dSHarvey Harrison 	/* Executive summary in case the body of the oops scrolled away */
881c61e211dSHarvey Harrison 	printk(KERN_EMERG "CR2: %016lx\n", address);
882c61e211dSHarvey Harrison 	oops_end(flags, regs, SIGKILL);
883c61e211dSHarvey Harrison #endif
884c61e211dSHarvey Harrison 
885c61e211dSHarvey Harrison /*
886c61e211dSHarvey Harrison  * We ran out of memory, or some other thing happened to us that made
887c61e211dSHarvey Harrison  * us unable to handle the page fault gracefully.
888c61e211dSHarvey Harrison  */
889c61e211dSHarvey Harrison out_of_memory:
890c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
891c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
892c61e211dSHarvey Harrison 	if (is_global_init(tsk)) {
893c61e211dSHarvey Harrison 		yield();
894c61e211dSHarvey Harrison 		down_read(&mm->mmap_sem);
895c61e211dSHarvey Harrison 		goto survive;
896c61e211dSHarvey Harrison 	}
897c61e211dSHarvey Harrison #else
898c61e211dSHarvey Harrison 	if (is_global_init(current)) {
899c61e211dSHarvey Harrison 		yield();
900c61e211dSHarvey Harrison 		goto again;
901c61e211dSHarvey Harrison 	}
902c61e211dSHarvey Harrison #endif
903c61e211dSHarvey Harrison 	printk("VM: killing process %s\n", tsk->comm);
904c61e211dSHarvey Harrison 	if (error_code & PF_USER)
905c61e211dSHarvey Harrison 		do_group_exit(SIGKILL);
906c61e211dSHarvey Harrison 	goto no_context;
907c61e211dSHarvey Harrison 
908c61e211dSHarvey Harrison do_sigbus:
909c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
910c61e211dSHarvey Harrison 
911c61e211dSHarvey Harrison 	/* Kernel mode? Handle exceptions or die */
912c61e211dSHarvey Harrison 	if (!(error_code & PF_USER))
913c61e211dSHarvey Harrison 		goto no_context;
914c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
915c61e211dSHarvey Harrison 	/* User space => ok to do another page fault */
916c61e211dSHarvey Harrison 	if (is_prefetch(regs, address, error_code))
917c61e211dSHarvey Harrison 		return;
918c61e211dSHarvey Harrison #endif
919c61e211dSHarvey Harrison 	tsk->thread.cr2 = address;
920c61e211dSHarvey Harrison 	tsk->thread.error_code = error_code;
921c61e211dSHarvey Harrison 	tsk->thread.trap_no = 14;
922c61e211dSHarvey Harrison 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
923c61e211dSHarvey Harrison }
924c61e211dSHarvey Harrison 
925c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
926c61e211dSHarvey Harrison DEFINE_SPINLOCK(pgd_lock);
927c61e211dSHarvey Harrison LIST_HEAD(pgd_list);
928c61e211dSHarvey Harrison #endif
929c61e211dSHarvey Harrison 
930c61e211dSHarvey Harrison void vmalloc_sync_all(void)
931c61e211dSHarvey Harrison {
932c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
933c61e211dSHarvey Harrison 	/*
934c61e211dSHarvey Harrison 	 * Note that races in the updates of insync and start aren't
935c61e211dSHarvey Harrison 	 * problematic: insync can only get set bits added, and updates to
936c61e211dSHarvey Harrison 	 * start are only improving performance (without affecting correctness
937c61e211dSHarvey Harrison 	 * if undone).
938c61e211dSHarvey Harrison 	 */
939c61e211dSHarvey Harrison 	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
940c61e211dSHarvey Harrison 	static unsigned long start = TASK_SIZE;
941c61e211dSHarvey Harrison 	unsigned long address;
942c61e211dSHarvey Harrison 
943c61e211dSHarvey Harrison 	if (SHARED_KERNEL_PMD)
944c61e211dSHarvey Harrison 		return;
945c61e211dSHarvey Harrison 
946c61e211dSHarvey Harrison 	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
947c61e211dSHarvey Harrison 	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
948c61e211dSHarvey Harrison 		if (!test_bit(pgd_index(address), insync)) {
949c61e211dSHarvey Harrison 			unsigned long flags;
950c61e211dSHarvey Harrison 			struct page *page;
951c61e211dSHarvey Harrison 
952c61e211dSHarvey Harrison 			spin_lock_irqsave(&pgd_lock, flags);
953c61e211dSHarvey Harrison 			for (page = pgd_list; page; page =
954c61e211dSHarvey Harrison 					(struct page *)page->index)
955c61e211dSHarvey Harrison 				if (!vmalloc_sync_one(page_address(page),
956c61e211dSHarvey Harrison 								address)) {
957c61e211dSHarvey Harrison 					BUG_ON(page != pgd_list);
958c61e211dSHarvey Harrison 					break;
959c61e211dSHarvey Harrison 				}
960c61e211dSHarvey Harrison 			spin_unlock_irqrestore(&pgd_lock, flags);
961c61e211dSHarvey Harrison 			if (!page)
962c61e211dSHarvey Harrison 				set_bit(pgd_index(address), insync);
963c61e211dSHarvey Harrison 		}
964c61e211dSHarvey Harrison 		if (address == start && test_bit(pgd_index(address), insync))
965c61e211dSHarvey Harrison 			start = address + PGDIR_SIZE;
966c61e211dSHarvey Harrison 	}
967c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
968c61e211dSHarvey Harrison 	/*
969c61e211dSHarvey Harrison 	 * Note that races in the updates of insync and start aren't
970c61e211dSHarvey Harrison 	 * problematic: insync can only get set bits added, and updates to
971c61e211dSHarvey Harrison 	 * start are only improving performance (without affecting correctness
972c61e211dSHarvey Harrison 	 * if undone).
973c61e211dSHarvey Harrison 	 */
974c61e211dSHarvey Harrison 	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
975c61e211dSHarvey Harrison 	static unsigned long start = VMALLOC_START & PGDIR_MASK;
976c61e211dSHarvey Harrison 	unsigned long address;
977c61e211dSHarvey Harrison 
978c61e211dSHarvey Harrison 	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
979c61e211dSHarvey Harrison 		if (!test_bit(pgd_index(address), insync)) {
980c61e211dSHarvey Harrison 			const pgd_t *pgd_ref = pgd_offset_k(address);
981c61e211dSHarvey Harrison 			struct page *page;
982c61e211dSHarvey Harrison 
983c61e211dSHarvey Harrison 			if (pgd_none(*pgd_ref))
984c61e211dSHarvey Harrison 				continue;
985c61e211dSHarvey Harrison 			spin_lock(&pgd_lock);
986c61e211dSHarvey Harrison 			list_for_each_entry(page, &pgd_list, lru) {
987c61e211dSHarvey Harrison 				pgd_t *pgd;
988c61e211dSHarvey Harrison 				pgd = (pgd_t *)page_address(page) + pgd_index(address);
989c61e211dSHarvey Harrison 				if (pgd_none(*pgd))
990c61e211dSHarvey Harrison 					set_pgd(pgd, *pgd_ref);
991c61e211dSHarvey Harrison 				else
992c61e211dSHarvey Harrison 					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
993c61e211dSHarvey Harrison 			}
994c61e211dSHarvey Harrison 			spin_unlock(&pgd_lock);
995c61e211dSHarvey Harrison 			set_bit(pgd_index(address), insync);
996c61e211dSHarvey Harrison 		}
997c61e211dSHarvey Harrison 		if (address == start)
998c61e211dSHarvey Harrison 			start = address + PGDIR_SIZE;
999c61e211dSHarvey Harrison 	}
1000c61e211dSHarvey Harrison 	/* Check that there is no need to do the same for the modules area. */
1001c61e211dSHarvey Harrison 	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
1002c61e211dSHarvey Harrison 	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
1003c61e211dSHarvey Harrison 				(__START_KERNEL & PGDIR_MASK)));
1004c61e211dSHarvey Harrison #endif
1005c61e211dSHarvey Harrison }
1006