xref: /openbmc/linux/arch/x86/mm/fault.c (revision 0fd0e3da4557c479b820b9a4a7afa25b4637ddf2)
1c61e211dSHarvey Harrison /*
2c61e211dSHarvey Harrison  *  Copyright (C) 1995  Linus Torvalds
3c61e211dSHarvey Harrison  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4c61e211dSHarvey Harrison  */
5c61e211dSHarvey Harrison 
6c61e211dSHarvey Harrison #include <linux/signal.h>
7c61e211dSHarvey Harrison #include <linux/sched.h>
8c61e211dSHarvey Harrison #include <linux/kernel.h>
9c61e211dSHarvey Harrison #include <linux/errno.h>
10c61e211dSHarvey Harrison #include <linux/string.h>
11c61e211dSHarvey Harrison #include <linux/types.h>
12c61e211dSHarvey Harrison #include <linux/ptrace.h>
13*0fd0e3daSPekka Paalanen #include <linux/mmiotrace.h>
14c61e211dSHarvey Harrison #include <linux/mman.h>
15c61e211dSHarvey Harrison #include <linux/mm.h>
16c61e211dSHarvey Harrison #include <linux/smp.h>
17c61e211dSHarvey Harrison #include <linux/interrupt.h>
18c61e211dSHarvey Harrison #include <linux/init.h>
19c61e211dSHarvey Harrison #include <linux/tty.h>
20c61e211dSHarvey Harrison #include <linux/vt_kern.h>		/* For unblank_screen() */
21c61e211dSHarvey Harrison #include <linux/compiler.h>
22c61e211dSHarvey Harrison #include <linux/highmem.h>
23c61e211dSHarvey Harrison #include <linux/bootmem.h>		/* for max_low_pfn */
24c61e211dSHarvey Harrison #include <linux/vmalloc.h>
25c61e211dSHarvey Harrison #include <linux/module.h>
26c61e211dSHarvey Harrison #include <linux/kprobes.h>
27c61e211dSHarvey Harrison #include <linux/uaccess.h>
28c61e211dSHarvey Harrison #include <linux/kdebug.h>
29c61e211dSHarvey Harrison 
30c61e211dSHarvey Harrison #include <asm/system.h>
31c61e211dSHarvey Harrison #include <asm/desc.h>
32c61e211dSHarvey Harrison #include <asm/segment.h>
33c61e211dSHarvey Harrison #include <asm/pgalloc.h>
34c61e211dSHarvey Harrison #include <asm/smp.h>
35c61e211dSHarvey Harrison #include <asm/tlbflush.h>
36c61e211dSHarvey Harrison #include <asm/proto.h>
37c61e211dSHarvey Harrison #include <asm-generic/sections.h>
38c61e211dSHarvey Harrison 
39c61e211dSHarvey Harrison /*
40c61e211dSHarvey Harrison  * Page fault error code bits
41c61e211dSHarvey Harrison  *	bit 0 == 0 means no page found, 1 means protection fault
42c61e211dSHarvey Harrison  *	bit 1 == 0 means read, 1 means write
43c61e211dSHarvey Harrison  *	bit 2 == 0 means kernel, 1 means user-mode
44c61e211dSHarvey Harrison  *	bit 3 == 1 means use of reserved bit detected
45c61e211dSHarvey Harrison  *	bit 4 == 1 means fault was an instruction fetch
46c61e211dSHarvey Harrison  */
#define PF_PROT		(1<<0)		/* 0: no page found, 1: protection fault */
#define PF_WRITE	(1<<1)		/* 0: read access,   1: write access */
#define PF_USER		(1<<2)		/* 0: kernel mode,   1: user mode */
#define PF_RSVD		(1<<3)		/* 1: use of reserved bit detected */
#define PF_INSTR	(1<<4)		/* 1: fault was an instruction fetch */
52c61e211dSHarvey Harrison 
/*
 * Give the mmiotrace kmmio machinery first shot at the fault.
 * Returns -1 when kmmio_handler() fully consumed the fault, 0 when
 * normal page-fault processing should continue.
 */
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE_HOOKS
	if (unlikely(is_kmmio_active()) && kmmio_handler(regs, addr) == 1)
		return -1;
#endif
	return 0;
}
6286069782SPekka Paalanen 
/*
 * Give an active kprobe a chance to handle the fault.  Returns
 * non-zero when the kprobe fault handler consumed it; only kernel-mode
 * faults are offered (kprobes never fire in user mode here).
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		/* 14 is the page-fault trap number passed to the handler */
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}
85c61e211dSHarvey Harrison 
/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 *
 * Decodes up to 15 bytes (the maximum x86 instruction length) at the
 * faulting IP, skipping valid prefix bytes, and returns non-zero when
 * the instruction is a prefetch (opcode 0x0F 0x0D or 0x0F 0x18).
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;	/* still inside the (optional) prefix bytes */
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was a exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	/* A user-mode IP above TASK_SIZE cannot be safely decoded. */
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		/* Bail out if the instruction bytes are unreadable. */
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			/* Read the second opcode byte (after the 0x0F). */
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
177c61e211dSHarvey Harrison 
178c61e211dSHarvey Harrison static void force_sig_info_fault(int si_signo, int si_code,
179c61e211dSHarvey Harrison 	unsigned long address, struct task_struct *tsk)
180c61e211dSHarvey Harrison {
181c61e211dSHarvey Harrison 	siginfo_t info;
182c61e211dSHarvey Harrison 
183c61e211dSHarvey Harrison 	info.si_signo = si_signo;
184c61e211dSHarvey Harrison 	info.si_errno = 0;
185c61e211dSHarvey Harrison 	info.si_code = si_code;
186c61e211dSHarvey Harrison 	info.si_addr = (void __user *)address;
187c61e211dSHarvey Harrison 	force_sig_info(si_signo, &info, tsk);
188c61e211dSHarvey Harrison }
189c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/* Return non-zero when the kernel pointer p cannot be safely read. */
static int bad_address(void *p)
{
	unsigned long scratch;

	return probe_kernel_address((unsigned long *)p, scratch);
}
#endif
197c61e211dSHarvey Harrison 
/*
 * Print the hardware page-table entries mapping 'address', walking down
 * from CR3 as far as the present entries allow.  Used by the oops paths
 * below to show why a kernel access faulted.
 */
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	/* Wide enough for a PAE (64-bit) or non-PAE (32-bit) entry. */
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
		                                         & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		/* Clear NX (bit 63) so the pfn range check below works. */
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
		                                         & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	/* CR3 carries flag bits too; mask down to the page frame. */
	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	/* A large (1GB) pud is a leaf: there is no pmd/pte below it. */
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	/* Likewise a large (2MB) pmd is a leaf. */
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}
270c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_32
/*
 * Copy the kernel (init_mm) mapping covering 'address' into the given
 * per-process page directory.  Returns the kernel pmd for the address,
 * or NULL when init_mm itself has no mapping there.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		/* Propagate the kernel pmd into this page table. */
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		/* Already present: it must agree with the kernel's copy. */
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif
308c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/* Printed (once) when the K8 erratum #93 workaround in is_errata93() fires. */
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif
316c61e211dSHarvey Harrison 
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;	/* warn only once per boot */
	/* Only instruction-fetch faults: fault address == IP. */
	if (address != regs->ip)
		return 0;
	/* The erratum clears the upper 32 bits, so they must be zero. */
	if ((address >> 32) != 0)
		return 0;
	/* Restore the (assumed) lost upper half of the address. */
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		/* Resume execution at the repaired address. */
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}
347c61e211dSHarvey Harrison 
/*
 * Work around K8 erratum #100: a K8 in compat mode occasionally jumps
 * to an illegal address above 4GB.  Such addresses are not reachable,
 * so just detect this case here and return.  Any code segment in the
 * LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((address >> 32) &&
	    (regs->cs == __USER32_CS || (regs->cs & (1<<2))))
		return 1;
#endif
	return 0;
}
363c61e211dSHarvey Harrison 
364c61e211dSHarvey Harrison void do_invalid_op(struct pt_regs *, unsigned long);
365c61e211dSHarvey Harrison 
/*
 * Pentium F0 0F C7 C8 bug workaround: when the fault address lands on
 * IDT entry 6 (invalid opcode), deliver do_invalid_op() instead of
 * treating it as an ordinary page fault.  Returns 1 when handled.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_data.f00f_bug) {
		/* IDT entries are 8 bytes; compute the faulting vector. */
		unsigned long nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}
384c61e211dSHarvey Harrison 
/*
 * Print the "BUG: unable to handle kernel ..." oops header for the
 * faulting address: an NX-violation hint where detectable, the fault
 * address, the IP, and a page-table dump.
 */
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	/* Instruction fetch from a present-but-NX page: flag it loudly. */
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}
419c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * Oops path for a corrupted kernel page table: report the address,
 * dump the page-table walk, record the fault in the task, and die.
 */
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	/*
	 * NOTE(review): presumably a non-zero __die() return means the
	 * oops was already handled (e.g. by a debugger), so regs is
	 * cleared before oops_end() — confirm against __die()'s contract.
	 */
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif
439c61e211dSHarvey Harrison 
440d8b57bb7SThomas Gleixner static int spurious_fault_check(unsigned long error_code, pte_t *pte)
441d8b57bb7SThomas Gleixner {
442d8b57bb7SThomas Gleixner 	if ((error_code & PF_WRITE) && !pte_write(*pte))
443d8b57bb7SThomas Gleixner 		return 0;
444d8b57bb7SThomas Gleixner 	if ((error_code & PF_INSTR) && !pte_exec(*pte))
445d8b57bb7SThomas Gleixner 		return 0;
446d8b57bb7SThomas Gleixner 
447d8b57bb7SThomas Gleixner 	return 1;
448d8b57bb7SThomas Gleixner }
449d8b57bb7SThomas Gleixner 
/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero when the current page-table permissions already
 * allow the access, i.e. the fault came from a stale TLB entry.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	/* Walk the kernel reference page table for 'address'. */
	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	/* A large pud is the leaf entry; check its permissions directly. */
	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	/* Likewise for a large pmd. */
	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}
4955b727a3bSJeremy Fitzhardinge 
/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 *
 * Returns 0 when the fault was resolved from the reference page
 * table, -1 when it is a genuine fault the caller must handle.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the later
	   case just flush. */

	/* current->mm may be NULL (kernel thread); use init_mm then. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}
576c61e211dSHarvey Harrison 
577c61e211dSHarvey Harrison int show_unhandled_signals = 1;
578c61e211dSHarvey Harrison 
579c61e211dSHarvey Harrison /*
580c61e211dSHarvey Harrison  * This routine handles page faults.  It determines the address,
581c61e211dSHarvey Harrison  * and the problem, and then passes it off to one of the appropriate
582c61e211dSHarvey Harrison  * routines.
583c61e211dSHarvey Harrison  */
/*
 * Hardware error code bits tested below: PF_PROT (protection violation
 * on a present page), PF_WRITE (write access), PF_USER (fault taken in
 * user mode, also OR'ed in for user-mode registers), PF_RSVD (reserved
 * page-table bit set).  See also the "(error_code & 4)" /
 * "(error_code & 9)" note further down.
 */
584c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
585c61e211dSHarvey Harrison asmlinkage
586c61e211dSHarvey Harrison #endif
587c61e211dSHarvey Harrison void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
588c61e211dSHarvey Harrison {
589c61e211dSHarvey Harrison 	struct task_struct *tsk;
590c61e211dSHarvey Harrison 	struct mm_struct *mm;
591c61e211dSHarvey Harrison 	struct vm_area_struct *vma;
592c61e211dSHarvey Harrison 	unsigned long address;
593c61e211dSHarvey Harrison 	int write, si_code;
594c61e211dSHarvey Harrison 	int fault;
595c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
596c61e211dSHarvey Harrison 	unsigned long flags;
597c61e211dSHarvey Harrison #endif
598c61e211dSHarvey Harrison 
599c61e211dSHarvey Harrison 	/*
600c61e211dSHarvey Harrison 	 * We can fault from pretty much anywhere, with unknown IRQ state.
601c61e211dSHarvey Harrison 	 */
602c61e211dSHarvey Harrison 	trace_hardirqs_fixup();
603c61e211dSHarvey Harrison 
604c61e211dSHarvey Harrison 	tsk = current;
605c61e211dSHarvey Harrison 	mm = tsk->mm;
606c61e211dSHarvey Harrison 	prefetchw(&mm->mmap_sem);
607c61e211dSHarvey Harrison 
608c61e211dSHarvey Harrison 	/* get the address */
609c61e211dSHarvey Harrison 	address = read_cr2();
610c61e211dSHarvey Harrison 
611c61e211dSHarvey Harrison 	si_code = SEGV_MAPERR;
612c61e211dSHarvey Harrison 
	/* Let a registered kprobe claim the fault first. */
613c61e211dSHarvey Harrison 	if (notify_page_fault(regs))
614c61e211dSHarvey Harrison 		return;
	/* Give the mmiotrace probe (linux/mmiotrace.h) a chance to claim it. */
615*0fd0e3daSPekka Paalanen 	if (unlikely(kmmio_fault(regs, address)))
61686069782SPekka Paalanen 		return;
617c61e211dSHarvey Harrison 
618c61e211dSHarvey Harrison 	/*
619c61e211dSHarvey Harrison 	 * We fault-in kernel-space virtual memory on-demand. The
620c61e211dSHarvey Harrison 	 * 'reference' page table is init_mm.pgd.
621c61e211dSHarvey Harrison 	 *
622c61e211dSHarvey Harrison 	 * NOTE! We MUST NOT take any locks for this case. We may
623c61e211dSHarvey Harrison 	 * be in an interrupt or a critical region, and should
624c61e211dSHarvey Harrison 	 * only copy the information from the master page table,
625c61e211dSHarvey Harrison 	 * nothing more.
626c61e211dSHarvey Harrison 	 *
627c61e211dSHarvey Harrison 	 * This verifies that the fault happens in kernel space
628c61e211dSHarvey Harrison 	 * (error_code & 4) == 0, and that the fault was not a
629c61e211dSHarvey Harrison 	 * protection error (error_code & 9) == 0.
630c61e211dSHarvey Harrison 	 */
631c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
632c61e211dSHarvey Harrison 	if (unlikely(address >= TASK_SIZE)) {
633cf89ec92SHarvey Harrison #else
634cf89ec92SHarvey Harrison 	if (unlikely(address >= TASK_SIZE64)) {
635cf89ec92SHarvey Harrison #endif
636c61e211dSHarvey Harrison 		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
637c61e211dSHarvey Harrison 		    vmalloc_fault(address) >= 0)
638c61e211dSHarvey Harrison 			return;
6395b727a3bSJeremy Fitzhardinge 
6405b727a3bSJeremy Fitzhardinge 		/* Can handle a stale RO->RW TLB */
6415b727a3bSJeremy Fitzhardinge 		if (spurious_fault(address, error_code))
6425b727a3bSJeremy Fitzhardinge 			return;
6435b727a3bSJeremy Fitzhardinge 
644c61e211dSHarvey Harrison 		/*
645c61e211dSHarvey Harrison 		 * Don't take the mm semaphore here. If we fixup a prefetch
646c61e211dSHarvey Harrison 		 * fault we could otherwise deadlock.
647c61e211dSHarvey Harrison 		 */
648c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
649c61e211dSHarvey Harrison 	}
650c61e211dSHarvey Harrison 
651cf89ec92SHarvey Harrison 
652cf89ec92SHarvey Harrison #ifdef CONFIG_X86_32
653c61e211dSHarvey Harrison 	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
654c61e211dSHarvey Harrison 	   fault has been handled. */
6556b6891f9Sgorcunov@gmail.com 	if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
656c61e211dSHarvey Harrison 		local_irq_enable();
657c61e211dSHarvey Harrison 
658c61e211dSHarvey Harrison 	/*
659c61e211dSHarvey Harrison 	 * If we're in an interrupt, have no user context or are running in an
660c61e211dSHarvey Harrison 	 * atomic region then we must not take the fault.
661c61e211dSHarvey Harrison 	 */
662c61e211dSHarvey Harrison 	if (in_atomic() || !mm)
663c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
664c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
665c61e211dSHarvey Harrison 	if (likely(regs->flags & X86_EFLAGS_IF))
666c61e211dSHarvey Harrison 		local_irq_enable();
667c61e211dSHarvey Harrison 
	/* A reserved-bit fault means corrupt page tables: does not return. */
668c61e211dSHarvey Harrison 	if (unlikely(error_code & PF_RSVD))
669c61e211dSHarvey Harrison 		pgtable_bad(address, regs, error_code);
670c61e211dSHarvey Harrison 
671c61e211dSHarvey Harrison 	/*
672c61e211dSHarvey Harrison 	 * If we're in an interrupt, have no user context or are running in an
673c61e211dSHarvey Harrison 	 * atomic region then we must not take the fault.
674c61e211dSHarvey Harrison 	 */
675c61e211dSHarvey Harrison 	if (unlikely(in_atomic() || !mm))
676c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
677c61e211dSHarvey Harrison 
678c61e211dSHarvey Harrison 	/*
679c61e211dSHarvey Harrison 	 * User-mode registers count as a user access even for any
680c61e211dSHarvey Harrison 	 * potential system fault or CPU buglet.
681c61e211dSHarvey Harrison 	 */
682c61e211dSHarvey Harrison 	if (user_mode_vm(regs))
683c61e211dSHarvey Harrison 		error_code |= PF_USER;
/* 64-bit only: re-entry point used by the out_of_memory path for init. */
684c61e211dSHarvey Harrison again:
685c61e211dSHarvey Harrison #endif
686c61e211dSHarvey Harrison 	/* When running in the kernel we expect faults to occur only to
687c61e211dSHarvey Harrison 	 * addresses in user space.  All other faults represent errors in the
688c61e211dSHarvey Harrison 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
689c61e211dSHarvey Harrison 	 * erroneous fault occurring in a code path which already holds mmap_sem
690c61e211dSHarvey Harrison 	 * we will deadlock attempting to validate the fault against the
691c61e211dSHarvey Harrison 	 * address space.  Luckily the kernel only validly references user
692c61e211dSHarvey Harrison 	 * space from well defined areas of code, which are listed in the
693c61e211dSHarvey Harrison 	 * exceptions table.
694c61e211dSHarvey Harrison 	 *
695c61e211dSHarvey Harrison 	 * As the vast majority of faults will be valid we will only perform
696c61e211dSHarvey Harrison 	 * the source reference check when there is a possibility of a deadlock.
697c61e211dSHarvey Harrison 	 * Attempt to lock the address space, if we cannot we then validate the
698c61e211dSHarvey Harrison 	 * source.  If this is invalid we can skip the address space check,
699c61e211dSHarvey Harrison 	 * thus avoiding the deadlock.
700c61e211dSHarvey Harrison 	 */
701c61e211dSHarvey Harrison 	if (!down_read_trylock(&mm->mmap_sem)) {
702c61e211dSHarvey Harrison 		if ((error_code & PF_USER) == 0 &&
703c61e211dSHarvey Harrison 		    !search_exception_tables(regs->ip))
704c61e211dSHarvey Harrison 			goto bad_area_nosemaphore;
705c61e211dSHarvey Harrison 		down_read(&mm->mmap_sem);
706c61e211dSHarvey Harrison 	}
707c61e211dSHarvey Harrison 
708c61e211dSHarvey Harrison 	vma = find_vma(mm, address);
709c61e211dSHarvey Harrison 	if (!vma)
710c61e211dSHarvey Harrison 		goto bad_area;
711c61e211dSHarvey Harrison 	if (vma->vm_start <= address)
712c61e211dSHarvey Harrison 		goto good_area;
	/* Below the nearest vma: only a grows-down (stack) vma may expand. */
713c61e211dSHarvey Harrison 	if (!(vma->vm_flags & VM_GROWSDOWN))
714c61e211dSHarvey Harrison 		goto bad_area;
715c61e211dSHarvey Harrison 	if (error_code & PF_USER) {
716c61e211dSHarvey Harrison 		/*
717c61e211dSHarvey Harrison 		 * Accessing the stack below %sp is always a bug.
718c61e211dSHarvey Harrison 		 * The large cushion allows instructions like enter
719c61e211dSHarvey Harrison 		 * and pusha to work.  ("enter $65535,$31" pushes
720c61e211dSHarvey Harrison 		 * 32 pointers and then decrements %sp by 65535.)
721c61e211dSHarvey Harrison 		 */
722c61e211dSHarvey Harrison 		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
723c61e211dSHarvey Harrison 			goto bad_area;
724c61e211dSHarvey Harrison 	}
725c61e211dSHarvey Harrison 	if (expand_stack(vma, address))
726c61e211dSHarvey Harrison 		goto bad_area;
727c61e211dSHarvey Harrison /*
728c61e211dSHarvey Harrison  * Ok, we have a good vm_area for this memory access, so
729c61e211dSHarvey Harrison  * we can handle it..
730c61e211dSHarvey Harrison  */
731c61e211dSHarvey Harrison good_area:
732c61e211dSHarvey Harrison 	si_code = SEGV_ACCERR;
733c61e211dSHarvey Harrison 	write = 0;
	/* Decode the access type from the error code and check it against
	 * the vma's permissions. */
734c61e211dSHarvey Harrison 	switch (error_code & (PF_PROT|PF_WRITE)) {
735c61e211dSHarvey Harrison 	default:	/* 3: write, present */
736c61e211dSHarvey Harrison 		/* fall through */
737c61e211dSHarvey Harrison 	case PF_WRITE:		/* write, not present */
738c61e211dSHarvey Harrison 		if (!(vma->vm_flags & VM_WRITE))
739c61e211dSHarvey Harrison 			goto bad_area;
740c61e211dSHarvey Harrison 		write++;
741c61e211dSHarvey Harrison 		break;
742c61e211dSHarvey Harrison 	case PF_PROT:		/* read, present */
743c61e211dSHarvey Harrison 		goto bad_area;
744c61e211dSHarvey Harrison 	case 0:			/* read, not present */
745c61e211dSHarvey Harrison 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
746c61e211dSHarvey Harrison 			goto bad_area;
747c61e211dSHarvey Harrison 	}
748c61e211dSHarvey Harrison 
749c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
/* 32-bit only: re-entry point used by the out_of_memory path for init. */
750c61e211dSHarvey Harrison survive:
751c61e211dSHarvey Harrison #endif
752c61e211dSHarvey Harrison 	/*
753c61e211dSHarvey Harrison 	 * If for any reason at all we couldn't handle the fault,
754c61e211dSHarvey Harrison 	 * make sure we exit gracefully rather than endlessly redo
755c61e211dSHarvey Harrison 	 * the fault.
756c61e211dSHarvey Harrison 	 */
757c61e211dSHarvey Harrison 	fault = handle_mm_fault(mm, vma, address, write);
758c61e211dSHarvey Harrison 	if (unlikely(fault & VM_FAULT_ERROR)) {
759c61e211dSHarvey Harrison 		if (fault & VM_FAULT_OOM)
760c61e211dSHarvey Harrison 			goto out_of_memory;
761c61e211dSHarvey Harrison 		else if (fault & VM_FAULT_SIGBUS)
762c61e211dSHarvey Harrison 			goto do_sigbus;
763c61e211dSHarvey Harrison 		BUG();
764c61e211dSHarvey Harrison 	}
	/* Account the fault against the task (major = required I/O). */
765c61e211dSHarvey Harrison 	if (fault & VM_FAULT_MAJOR)
766c61e211dSHarvey Harrison 		tsk->maj_flt++;
767c61e211dSHarvey Harrison 	else
768c61e211dSHarvey Harrison 		tsk->min_flt++;
769c61e211dSHarvey Harrison 
770c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
771c61e211dSHarvey Harrison 	/*
772c61e211dSHarvey Harrison 	 * Did it hit the DOS screen memory VA from vm86 mode?
773c61e211dSHarvey Harrison 	 */
774c61e211dSHarvey Harrison 	if (v8086_mode(regs)) {
775c61e211dSHarvey Harrison 		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
776c61e211dSHarvey Harrison 		if (bit < 32)
777c61e211dSHarvey Harrison 			tsk->thread.screen_bitmap |= 1 << bit;
778c61e211dSHarvey Harrison 	}
779c61e211dSHarvey Harrison #endif
780c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
781c61e211dSHarvey Harrison 	return;
782c61e211dSHarvey Harrison 
783c61e211dSHarvey Harrison /*
784c61e211dSHarvey Harrison  * Something tried to access memory that isn't in our memory map..
785c61e211dSHarvey Harrison  * Fix it, but check if it's kernel or user first..
786c61e211dSHarvey Harrison  */
787c61e211dSHarvey Harrison bad_area:
788c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
789c61e211dSHarvey Harrison 
790c61e211dSHarvey Harrison bad_area_nosemaphore:
791c61e211dSHarvey Harrison 	/* User mode accesses just cause a SIGSEGV */
792c61e211dSHarvey Harrison 	if (error_code & PF_USER) {
793c61e211dSHarvey Harrison 		/*
794c61e211dSHarvey Harrison 		 * It's possible to have interrupts off here.
795c61e211dSHarvey Harrison 		 */
796c61e211dSHarvey Harrison 		local_irq_enable();
797c61e211dSHarvey Harrison 
798c61e211dSHarvey Harrison 		/*
799c61e211dSHarvey Harrison 		 * Valid to do another page fault here because this one came
800c61e211dSHarvey Harrison 		 * from user space.
801c61e211dSHarvey Harrison 		 */
802c61e211dSHarvey Harrison 		if (is_prefetch(regs, address, error_code))
803c61e211dSHarvey Harrison 			return;
804c61e211dSHarvey Harrison 
805c61e211dSHarvey Harrison 		if (is_errata100(regs, address))
806c61e211dSHarvey Harrison 			return;
807c61e211dSHarvey Harrison 
808c61e211dSHarvey Harrison 		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
809c61e211dSHarvey Harrison 		    printk_ratelimit()) {
810c61e211dSHarvey Harrison 			printk(
811c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
812c61e211dSHarvey Harrison 			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
813c61e211dSHarvey Harrison #else
814c61e211dSHarvey Harrison 			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
815c61e211dSHarvey Harrison #endif
816c61e211dSHarvey Harrison 			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
817c61e211dSHarvey Harrison 			tsk->comm, task_pid_nr(tsk), address, regs->ip,
818c61e211dSHarvey Harrison 			regs->sp, error_code);
819c61e211dSHarvey Harrison 			print_vma_addr(" in ", regs->ip);
820c61e211dSHarvey Harrison 			printk("\n");
821c61e211dSHarvey Harrison 		}
822c61e211dSHarvey Harrison 
823c61e211dSHarvey Harrison 		tsk->thread.cr2 = address;
824c61e211dSHarvey Harrison 		/* Kernel addresses are always protection faults */
825c61e211dSHarvey Harrison 		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
826c61e211dSHarvey Harrison 		tsk->thread.trap_no = 14;
827c61e211dSHarvey Harrison 		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
828c61e211dSHarvey Harrison 		return;
829c61e211dSHarvey Harrison 	}
830c61e211dSHarvey Harrison 
831c61e211dSHarvey Harrison 	if (is_f00f_bug(regs, address))
832c61e211dSHarvey Harrison 		return;
833c61e211dSHarvey Harrison 
834c61e211dSHarvey Harrison no_context:
835c61e211dSHarvey Harrison 	/* Are we prepared to handle this kernel fault?  */
836c61e211dSHarvey Harrison 	if (fixup_exception(regs))
837c61e211dSHarvey Harrison 		return;
838c61e211dSHarvey Harrison 
839c61e211dSHarvey Harrison 	/*
840c61e211dSHarvey Harrison 	 * X86_32
841c61e211dSHarvey Harrison 	 * Valid to do another page fault here, because if this fault
842c61e211dSHarvey Harrison 	 * had been triggered by is_prefetch fixup_exception would have
843c61e211dSHarvey Harrison 	 * handled it.
844c61e211dSHarvey Harrison 	 *
845c61e211dSHarvey Harrison 	 * X86_64
846c61e211dSHarvey Harrison 	 * Hall of shame of CPU/BIOS bugs.
847c61e211dSHarvey Harrison 	 */
848c61e211dSHarvey Harrison 	if (is_prefetch(regs, address, error_code))
849c61e211dSHarvey Harrison 		return;
850c61e211dSHarvey Harrison 
851c61e211dSHarvey Harrison 	if (is_errata93(regs, address))
852c61e211dSHarvey Harrison 		return;
853c61e211dSHarvey Harrison 
854c61e211dSHarvey Harrison /*
855c61e211dSHarvey Harrison  * Oops. The kernel tried to access some bad page. We'll have to
856c61e211dSHarvey Harrison  * terminate things with extreme prejudice.
857c61e211dSHarvey Harrison  */
858c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
859c61e211dSHarvey Harrison 	bust_spinlocks(1);
860fd40d6e3SHarvey Harrison #else
861fd40d6e3SHarvey Harrison 	flags = oops_begin();
862fd40d6e3SHarvey Harrison #endif
863c61e211dSHarvey Harrison 
864c61e211dSHarvey Harrison 	show_fault_oops(regs, error_code, address);
865c61e211dSHarvey Harrison 
866c61e211dSHarvey Harrison 	tsk->thread.cr2 = address;
867c61e211dSHarvey Harrison 	tsk->thread.trap_no = 14;
868c61e211dSHarvey Harrison 	tsk->thread.error_code = error_code;
869fd40d6e3SHarvey Harrison 
870fd40d6e3SHarvey Harrison #ifdef CONFIG_X86_32
871c61e211dSHarvey Harrison 	die("Oops", regs, error_code);
872c61e211dSHarvey Harrison 	bust_spinlocks(0);
873c61e211dSHarvey Harrison 	do_exit(SIGKILL);
874fd40d6e3SHarvey Harrison #else
875c61e211dSHarvey Harrison 	if (__die("Oops", regs, error_code))
876c61e211dSHarvey Harrison 		regs = NULL;
877c61e211dSHarvey Harrison 	/* Executive summary in case the body of the oops scrolled away */
878c61e211dSHarvey Harrison 	printk(KERN_EMERG "CR2: %016lx\n", address);
879c61e211dSHarvey Harrison 	oops_end(flags, regs, SIGKILL);
880c61e211dSHarvey Harrison #endif
881c61e211dSHarvey Harrison 
882c61e211dSHarvey Harrison /*
883c61e211dSHarvey Harrison  * We ran out of memory, or some other thing happened to us that made
884c61e211dSHarvey Harrison  * us unable to handle the page fault gracefully.
885c61e211dSHarvey Harrison  */
886c61e211dSHarvey Harrison out_of_memory:
887c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
	/* init is never OOM-killed here: yield and retry the fault instead. */
888c61e211dSHarvey Harrison 	if (is_global_init(tsk)) {
889c61e211dSHarvey Harrison 		yield();
890fd40d6e3SHarvey Harrison #ifdef CONFIG_X86_32
891c61e211dSHarvey Harrison 		down_read(&mm->mmap_sem);
892c61e211dSHarvey Harrison 		goto survive;
893c61e211dSHarvey Harrison #else
894c61e211dSHarvey Harrison 		goto again;
895c61e211dSHarvey Harrison #endif
896fd40d6e3SHarvey Harrison 	}
897fd40d6e3SHarvey Harrison 
898c61e211dSHarvey Harrison 	printk("VM: killing process %s\n", tsk->comm);
899c61e211dSHarvey Harrison 	if (error_code & PF_USER)
900c61e211dSHarvey Harrison 		do_group_exit(SIGKILL);
901c61e211dSHarvey Harrison 	goto no_context;
902c61e211dSHarvey Harrison 
903c61e211dSHarvey Harrison do_sigbus:
904c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
905c61e211dSHarvey Harrison 
906c61e211dSHarvey Harrison 	/* Kernel mode? Handle exceptions or die */
907c61e211dSHarvey Harrison 	if (!(error_code & PF_USER))
908c61e211dSHarvey Harrison 		goto no_context;
909c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
910c61e211dSHarvey Harrison 	/* User space => ok to do another page fault */
911c61e211dSHarvey Harrison 	if (is_prefetch(regs, address, error_code))
912c61e211dSHarvey Harrison 		return;
913c61e211dSHarvey Harrison #endif
914c61e211dSHarvey Harrison 	tsk->thread.cr2 = address;
915c61e211dSHarvey Harrison 	tsk->thread.error_code = error_code;
916c61e211dSHarvey Harrison 	tsk->thread.trap_no = 14;
917c61e211dSHarvey Harrison 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
918c61e211dSHarvey Harrison }
919c61e211dSHarvey Harrison 
/*
 * pgd_lock protects pgd_list, the list of all page directories that
 * vmalloc_sync_all() walks (taken there with spin_lock_irqsave).
 */
920c61e211dSHarvey Harrison DEFINE_SPINLOCK(pgd_lock);
921c61e211dSHarvey Harrison LIST_HEAD(pgd_list);
922c61e211dSHarvey Harrison 
923c61e211dSHarvey Harrison void vmalloc_sync_all(void)
924c61e211dSHarvey Harrison {
925c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
926c61e211dSHarvey Harrison 	/*
927c61e211dSHarvey Harrison 	 * Note that races in the updates of insync and start aren't
928c61e211dSHarvey Harrison 	 * problematic: insync can only get set bits added, and updates to
929c61e211dSHarvey Harrison 	 * start are only improving performance (without affecting correctness
930c61e211dSHarvey Harrison 	 * if undone).
931c61e211dSHarvey Harrison 	 */
932c61e211dSHarvey Harrison 	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
933c61e211dSHarvey Harrison 	static unsigned long start = TASK_SIZE;
934c61e211dSHarvey Harrison 	unsigned long address;
935c61e211dSHarvey Harrison 
936c61e211dSHarvey Harrison 	if (SHARED_KERNEL_PMD)
937c61e211dSHarvey Harrison 		return;
938c61e211dSHarvey Harrison 
939c61e211dSHarvey Harrison 	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
940c61e211dSHarvey Harrison 	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
941c61e211dSHarvey Harrison 		if (!test_bit(pgd_index(address), insync)) {
942c61e211dSHarvey Harrison 			unsigned long flags;
943c61e211dSHarvey Harrison 			struct page *page;
944c61e211dSHarvey Harrison 
945c61e211dSHarvey Harrison 			spin_lock_irqsave(&pgd_lock, flags);
946e3ed910dSJeremy Fitzhardinge 			list_for_each_entry(page, &pgd_list, lru) {
947c61e211dSHarvey Harrison 				if (!vmalloc_sync_one(page_address(page),
948e3ed910dSJeremy Fitzhardinge 						      address))
949c61e211dSHarvey Harrison 					break;
950c61e211dSHarvey Harrison 			}
951c61e211dSHarvey Harrison 			spin_unlock_irqrestore(&pgd_lock, flags);
952c61e211dSHarvey Harrison 			if (!page)
953c61e211dSHarvey Harrison 				set_bit(pgd_index(address), insync);
954c61e211dSHarvey Harrison 		}
955c61e211dSHarvey Harrison 		if (address == start && test_bit(pgd_index(address), insync))
956c61e211dSHarvey Harrison 			start = address + PGDIR_SIZE;
957c61e211dSHarvey Harrison 	}
958c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
959c61e211dSHarvey Harrison 	/*
960c61e211dSHarvey Harrison 	 * Note that races in the updates of insync and start aren't
961c61e211dSHarvey Harrison 	 * problematic: insync can only get set bits added, and updates to
962c61e211dSHarvey Harrison 	 * start are only improving performance (without affecting correctness
963c61e211dSHarvey Harrison 	 * if undone).
964c61e211dSHarvey Harrison 	 */
965c61e211dSHarvey Harrison 	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
966c61e211dSHarvey Harrison 	static unsigned long start = VMALLOC_START & PGDIR_MASK;
967c61e211dSHarvey Harrison 	unsigned long address;
968c61e211dSHarvey Harrison 
969c61e211dSHarvey Harrison 	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
970c61e211dSHarvey Harrison 		if (!test_bit(pgd_index(address), insync)) {
971c61e211dSHarvey Harrison 			const pgd_t *pgd_ref = pgd_offset_k(address);
97258d5d0d8SIngo Molnar 			unsigned long flags;
973c61e211dSHarvey Harrison 			struct page *page;
974c61e211dSHarvey Harrison 
975c61e211dSHarvey Harrison 			if (pgd_none(*pgd_ref))
976c61e211dSHarvey Harrison 				continue;
97758d5d0d8SIngo Molnar 			spin_lock_irqsave(&pgd_lock, flags);
978c61e211dSHarvey Harrison 			list_for_each_entry(page, &pgd_list, lru) {
979c61e211dSHarvey Harrison 				pgd_t *pgd;
980c61e211dSHarvey Harrison 				pgd = (pgd_t *)page_address(page) + pgd_index(address);
981c61e211dSHarvey Harrison 				if (pgd_none(*pgd))
982c61e211dSHarvey Harrison 					set_pgd(pgd, *pgd_ref);
983c61e211dSHarvey Harrison 				else
984c61e211dSHarvey Harrison 					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
985c61e211dSHarvey Harrison 			}
98658d5d0d8SIngo Molnar 			spin_unlock_irqrestore(&pgd_lock, flags);
987c61e211dSHarvey Harrison 			set_bit(pgd_index(address), insync);
988c61e211dSHarvey Harrison 		}
989c61e211dSHarvey Harrison 		if (address == start)
990c61e211dSHarvey Harrison 			start = address + PGDIR_SIZE;
991c61e211dSHarvey Harrison 	}
992c61e211dSHarvey Harrison #endif
993c61e211dSHarvey Harrison }
994