xref: /openbmc/linux/arch/x86/mm/fault.c (revision 198030782cedf25391e67e7c88b04f87a5eb6563)
1c61e211dSHarvey Harrison /*
2c61e211dSHarvey Harrison  *  Copyright (C) 1995  Linus Torvalds
3c61e211dSHarvey Harrison  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4c61e211dSHarvey Harrison  */
5c61e211dSHarvey Harrison 
6c61e211dSHarvey Harrison #include <linux/signal.h>
7c61e211dSHarvey Harrison #include <linux/sched.h>
8c61e211dSHarvey Harrison #include <linux/kernel.h>
9c61e211dSHarvey Harrison #include <linux/errno.h>
10c61e211dSHarvey Harrison #include <linux/string.h>
11c61e211dSHarvey Harrison #include <linux/types.h>
12c61e211dSHarvey Harrison #include <linux/ptrace.h>
130fd0e3daSPekka Paalanen #include <linux/mmiotrace.h>
14c61e211dSHarvey Harrison #include <linux/mman.h>
15c61e211dSHarvey Harrison #include <linux/mm.h>
16c61e211dSHarvey Harrison #include <linux/smp.h>
17c61e211dSHarvey Harrison #include <linux/interrupt.h>
18c61e211dSHarvey Harrison #include <linux/init.h>
19c61e211dSHarvey Harrison #include <linux/tty.h>
20c61e211dSHarvey Harrison #include <linux/vt_kern.h>		/* For unblank_screen() */
21c61e211dSHarvey Harrison #include <linux/compiler.h>
22c61e211dSHarvey Harrison #include <linux/highmem.h>
23c61e211dSHarvey Harrison #include <linux/bootmem.h>		/* for max_low_pfn */
24c61e211dSHarvey Harrison #include <linux/vmalloc.h>
25c61e211dSHarvey Harrison #include <linux/module.h>
26c61e211dSHarvey Harrison #include <linux/kprobes.h>
27c61e211dSHarvey Harrison #include <linux/uaccess.h>
28c61e211dSHarvey Harrison #include <linux/kdebug.h>
297c9f8861SEric Sandeen #include <linux/magic.h>
30c61e211dSHarvey Harrison 
31c61e211dSHarvey Harrison #include <asm/system.h>
32c61e211dSHarvey Harrison #include <asm/desc.h>
33c61e211dSHarvey Harrison #include <asm/segment.h>
34c61e211dSHarvey Harrison #include <asm/pgalloc.h>
35c61e211dSHarvey Harrison #include <asm/smp.h>
36c61e211dSHarvey Harrison #include <asm/tlbflush.h>
37c61e211dSHarvey Harrison #include <asm/proto.h>
38c61e211dSHarvey Harrison #include <asm-generic/sections.h>
3970ef5641SJaswinder Singh #include <asm/traps.h>
40c61e211dSHarvey Harrison 
41c61e211dSHarvey Harrison /*
42c61e211dSHarvey Harrison  * Page fault error code bits
43c61e211dSHarvey Harrison  *	bit 0 == 0 means no page found, 1 means protection fault
44c61e211dSHarvey Harrison  *	bit 1 == 0 means read, 1 means write
45c61e211dSHarvey Harrison  *	bit 2 == 0 means kernel, 1 means user-mode
46c61e211dSHarvey Harrison  *	bit 3 == 1 means use of reserved bit detected
47c61e211dSHarvey Harrison  *	bit 4 == 1 means fault was an instruction fetch
48c61e211dSHarvey Harrison  */
49c61e211dSHarvey Harrison #define PF_PROT		(1<<0)
50c61e211dSHarvey Harrison #define PF_WRITE	(1<<1)
51c61e211dSHarvey Harrison #define PF_USER		(1<<2)
52c61e211dSHarvey Harrison #define PF_RSVD		(1<<3)
53c61e211dSHarvey Harrison #define PF_INSTR	(1<<4)
54c61e211dSHarvey Harrison 
/*
 * Give the mmiotrace probe first shot at the fault.  Returns -1 when a
 * kmmio probe fully handled the fault, 0 when normal page-fault
 * processing should continue.
 */
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE
	if (unlikely(is_kmmio_active()) && kmmio_handler(regs, addr) == 1)
		return -1;
#endif
	return 0;
}
6486069782SPekka Paalanen 
/*
 * Give kprobes a chance to claim the fault.  Returns 1 when a kprobe
 * fault handler consumed it, 0 when normal processing should continue.
 * Only kernel-mode faults are offered to kprobes.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int handled = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode_vm(regs)) {
		preempt_disable();
		handled = kprobe_running() && kprobe_fault_handler(regs, 14);
		preempt_enable();
	}

	return handled;
#else
	return 0;
#endif
}
83c61e211dSHarvey Harrison 
84c61e211dSHarvey Harrison /*
85c61e211dSHarvey Harrison  * X86_32
86c61e211dSHarvey Harrison  * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
87c61e211dSHarvey Harrison  * Check that here and ignore it.
88c61e211dSHarvey Harrison  *
89c61e211dSHarvey Harrison  * X86_64
90c61e211dSHarvey Harrison  * Sometimes the CPU reports invalid exceptions on prefetch.
91c61e211dSHarvey Harrison  * Check that here and ignore it.
92c61e211dSHarvey Harrison  *
93c61e211dSHarvey Harrison  * Opcode checker based on code by Richard Brunner
94c61e211dSHarvey Harrison  */
static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
			unsigned long addr)
{
	unsigned char *instr;
	int scan_more = 1;	/* keep decoding while we only see prefix bytes */
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was a exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;	/* x86 instructions are at most 15 bytes */

	/* Never decode kernel addresses on behalf of a user-mode fault */
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	/* Walk prefix bytes until we can classify the opcode */
	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		/* Fetch carefully: the instruction bytes may be unmapped */
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			/* Read the second opcode byte; stop if unreadable */
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
175c61e211dSHarvey Harrison 
176c61e211dSHarvey Harrison static void force_sig_info_fault(int si_signo, int si_code,
177c61e211dSHarvey Harrison 	unsigned long address, struct task_struct *tsk)
178c61e211dSHarvey Harrison {
179c61e211dSHarvey Harrison 	siginfo_t info;
180c61e211dSHarvey Harrison 
181c61e211dSHarvey Harrison 	info.si_signo = si_signo;
182c61e211dSHarvey Harrison 	info.si_errno = 0;
183c61e211dSHarvey Harrison 	info.si_code = si_code;
184c61e211dSHarvey Harrison 	info.si_addr = (void __user *)address;
185c61e211dSHarvey Harrison 	force_sig_info(si_signo, &info, tsk);
186c61e211dSHarvey Harrison }
187c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * Page-table-walk helper for dump_pagetable(): nonzero when @p cannot
 * be safely dereferenced (probe_kernel_address() faulted on it).
 */
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif
195c61e211dSHarvey Harrison 
/*
 * Print the page-table walk for @address on the oops console.
 *
 * 32-bit: walks the tables via physical addresses read from CR3, for
 * both PAE and non-PAE layouts.  64-bit: walks pgd/pud/pmd/pte with a
 * readability check (bad_address) before touching each level.
 */
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	/* Only follow the entry if it is present and within lowmem */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
		                                         & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
		                                         & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	/* Stop at a large (1G) pud: there is no pmd/pte level below it */
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	/* Same for a large (2M) pmd: no pte level below */
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}
268c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_32
/*
 * Copy the kernel (init_mm) mapping for a vmalloc-range @address into
 * the given per-process @pgd.  Returns the pmd in the kernel page
 * table, or NULL when init_mm has no mapping for @address either (the
 * fault is then a genuine bad access).
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		/* Missing locally but present in init_mm: copy it over */
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif
306c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/* Printed (at most once) by is_errata93() when the K8 erratum #93 workaround fires */
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif
314c61e211dSHarvey Harrison 
315c61e211dSHarvey Harrison /* Workaround for K8 erratum #93 & buggy BIOS.
316c61e211dSHarvey Harrison    BIOS SMM functions are required to use a specific workaround
317c61e211dSHarvey Harrison    to avoid corruption of the 64bit RIP register on C stepping K8.
318c61e211dSHarvey Harrison    A lot of BIOS that didn't get tested properly miss this.
319c61e211dSHarvey Harrison    The OS sees this as a page fault with the upper 32bits of RIP cleared.
320c61e211dSHarvey Harrison    Try to work around it here.
321c61e211dSHarvey Harrison    Note we only handle faults in kernel here.
322c61e211dSHarvey Harrison    Does nothing for X86_32
323c61e211dSHarvey Harrison  */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	/* The erratum manifests as a fault on the (truncated) RIP itself */
	if (address != regs->ip)
		return 0;
	/* Upper half must already be zero for the truncation theory to hold */
	if ((address >> 32) != 0)
		return 0;
	/* Re-materialize the upper 32 bits the CPU dropped */
	address |= 0xffffffffUL << 32;
	/* Only trust the repair if it lands in kernel text or module space */
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;	/* resume at the repaired address */
		return 1;
	}
#endif
	return 0;
}
345c61e211dSHarvey Harrison 
346c61e211dSHarvey Harrison /*
347c61e211dSHarvey Harrison  * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
348c61e211dSHarvey Harrison  * addresses >4GB.  We catch this in the page fault handler because these
349c61e211dSHarvey Harrison  * addresses are not reachable. Just detect this case and return.  Any code
350c61e211dSHarvey Harrison  * segment in LDT is compatibility mode.
351c61e211dSHarvey Harrison  */
/*
 * K8 erratum #100: a compat-mode task (32-bit CS, or any LDT code
 * segment) faulted on an address above 4GB it can never legally reach.
 * Returns 1 when the fault should simply be swallowed.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((address >> 32) &&
	    (regs->cs == __USER32_CS || (regs->cs & (1<<2))))
		return 1;
#endif
	return 0;
}
361c61e211dSHarvey Harrison 
/*
 * Returns 1 when the fault was the Pentium F00F erratum (a locked
 * CMPXCHG8B on the IDT) and has been converted into the invalid-opcode
 * trap the CPU should have raised; 0 otherwise.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		/* Which IDT descriptor (8 bytes each) does the address hit? */
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			/* Vector 6: deliver the invalid-opcode trap by hand */
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}
380c61e211dSHarvey Harrison 
/*
 * Print the header of a kernel-fault oops: an NX-violation diagnosis
 * (PAE only), the faulting address and IP, and the page-table walk.
 */
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	/* Instruction fetch from a non-executable page: flag it loudly */
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current_uid());
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	/* Faults in the first page are almost always NULL dereferences */
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}
411c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * 64-bit only: oops because a page-table entry itself is corrupted
 * (reserved bits set).  Kills the current task; only returns to the
 * faulting context if __die() reports the oops as handled.
 *
 * Cleanup: the original re-assigned tsk = current after the printk even
 * though tsk is already initialized to current at its declaration; the
 * redundant assignment is dropped.
 */
static noinline void pgtable_bad(struct pt_regs *regs,
			 unsigned long error_code, unsigned long address)
{
	unsigned long flags = oops_begin();
	int sig = SIGKILL;
	struct task_struct *tsk = current;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);
	/* Record the fault details for the dying task */
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		sig = 0;	/* oops was handled: do not raise the signal */
	oops_end(flags, regs, sig);
}
#endif
432c61e211dSHarvey Harrison 
43392181f19SNick Piggin static noinline void no_context(struct pt_regs *regs,
43492181f19SNick Piggin 			unsigned long error_code, unsigned long address)
43592181f19SNick Piggin {
43692181f19SNick Piggin 	struct task_struct *tsk = current;
437*19803078SIngo Molnar 	unsigned long *stackend;
438*19803078SIngo Molnar 
43992181f19SNick Piggin #ifdef CONFIG_X86_64
44092181f19SNick Piggin 	unsigned long flags;
44192181f19SNick Piggin 	int sig;
44292181f19SNick Piggin #endif
44392181f19SNick Piggin 
44492181f19SNick Piggin 	/* Are we prepared to handle this kernel fault?  */
44592181f19SNick Piggin 	if (fixup_exception(regs))
44692181f19SNick Piggin 		return;
44792181f19SNick Piggin 
44892181f19SNick Piggin 	/*
44992181f19SNick Piggin 	 * X86_32
45092181f19SNick Piggin 	 * Valid to do another page fault here, because if this fault
45192181f19SNick Piggin 	 * had been triggered by is_prefetch fixup_exception would have
45292181f19SNick Piggin 	 * handled it.
45392181f19SNick Piggin 	 *
45492181f19SNick Piggin 	 * X86_64
45592181f19SNick Piggin 	 * Hall of shame of CPU/BIOS bugs.
45692181f19SNick Piggin 	 */
45792181f19SNick Piggin 	if (is_prefetch(regs, error_code, address))
45892181f19SNick Piggin 		return;
45992181f19SNick Piggin 
46092181f19SNick Piggin 	if (is_errata93(regs, address))
46192181f19SNick Piggin 		return;
46292181f19SNick Piggin 
46392181f19SNick Piggin 	/*
46492181f19SNick Piggin 	 * Oops. The kernel tried to access some bad page. We'll have to
46592181f19SNick Piggin 	 * terminate things with extreme prejudice.
46692181f19SNick Piggin 	 */
46792181f19SNick Piggin #ifdef CONFIG_X86_32
46892181f19SNick Piggin 	bust_spinlocks(1);
46992181f19SNick Piggin #else
47092181f19SNick Piggin 	flags = oops_begin();
47192181f19SNick Piggin #endif
47292181f19SNick Piggin 
47392181f19SNick Piggin 	show_fault_oops(regs, error_code, address);
47492181f19SNick Piggin 
475*19803078SIngo Molnar  	stackend = end_of_stack(tsk);
476*19803078SIngo Molnar 	if (*stackend != STACK_END_MAGIC)
477*19803078SIngo Molnar 		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
478*19803078SIngo Molnar 
47992181f19SNick Piggin 	tsk->thread.cr2 = address;
48092181f19SNick Piggin 	tsk->thread.trap_no = 14;
48192181f19SNick Piggin 	tsk->thread.error_code = error_code;
48292181f19SNick Piggin 
48392181f19SNick Piggin #ifdef CONFIG_X86_32
48492181f19SNick Piggin 	die("Oops", regs, error_code);
48592181f19SNick Piggin 	bust_spinlocks(0);
48692181f19SNick Piggin 	do_exit(SIGKILL);
48792181f19SNick Piggin #else
48892181f19SNick Piggin 	sig = SIGKILL;
48992181f19SNick Piggin 	if (__die("Oops", regs, error_code))
49092181f19SNick Piggin 		sig = 0;
49192181f19SNick Piggin 	/* Executive summary in case the body of the oops scrolled away */
49292181f19SNick Piggin 	printk(KERN_EMERG "CR2: %016lx\n", address);
49392181f19SNick Piggin 	oops_end(flags, regs, sig);
49492181f19SNick Piggin #endif
49592181f19SNick Piggin }
49692181f19SNick Piggin 
/*
 * Report a bad-area fault without mmap_sem held: deliver SIGSEGV with
 * @si_code for user-mode faults (after screening the known errata),
 * otherwise fall through to the kernel-fault path.
 */
static void __bad_area_nosemaphore(struct pt_regs *regs,
			unsigned long error_code, unsigned long address,
			int si_code)
{
	struct task_struct *tsk = current;

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, error_code, address))
			return;

		if (is_errata100(regs, address))
			return;

		/* Rate-limited diagnostics for otherwise-unhandled segfaults */
		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address,
			(void *) regs->ip, (void *) regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

	no_context(regs, error_code, address);
}
54492181f19SNick Piggin 
54592181f19SNick Piggin static noinline void bad_area_nosemaphore(struct pt_regs *regs,
54692181f19SNick Piggin 			unsigned long error_code, unsigned long address)
54792181f19SNick Piggin {
54892181f19SNick Piggin 	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
54992181f19SNick Piggin }
55092181f19SNick Piggin 
55192181f19SNick Piggin static void __bad_area(struct pt_regs *regs,
55292181f19SNick Piggin 			unsigned long error_code, unsigned long address,
55392181f19SNick Piggin 			int si_code)
55492181f19SNick Piggin {
55592181f19SNick Piggin 	struct mm_struct *mm = current->mm;
55692181f19SNick Piggin 
55792181f19SNick Piggin 	/*
55892181f19SNick Piggin 	 * Something tried to access memory that isn't in our memory map..
55992181f19SNick Piggin 	 * Fix it, but check if it's kernel or user first..
56092181f19SNick Piggin 	 */
56192181f19SNick Piggin 	up_read(&mm->mmap_sem);
56292181f19SNick Piggin 
56392181f19SNick Piggin 	__bad_area_nosemaphore(regs, error_code, address, si_code);
56492181f19SNick Piggin }
56592181f19SNick Piggin 
56692181f19SNick Piggin static noinline void bad_area(struct pt_regs *regs,
56792181f19SNick Piggin 			unsigned long error_code, unsigned long address)
56892181f19SNick Piggin {
56992181f19SNick Piggin 	__bad_area(regs, error_code, address, SEGV_MAPERR);
57092181f19SNick Piggin }
57192181f19SNick Piggin 
57292181f19SNick Piggin static noinline void bad_area_access_error(struct pt_regs *regs,
57392181f19SNick Piggin 			unsigned long error_code, unsigned long address)
57492181f19SNick Piggin {
57592181f19SNick Piggin 	__bad_area(regs, error_code, address, SEGV_ACCERR);
57692181f19SNick Piggin }
57792181f19SNick Piggin 
/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void out_of_memory(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	/*
	 * We ran out of memory, call the OOM killer, and return the userspace
	 * (which will retry the fault, or kill us if we got oom-killed).
	 */
	up_read(&current->mm->mmap_sem);	/* drop mmap_sem before OOM handling */
	pagefault_out_of_memory();
}
58992181f19SNick Piggin 
59092181f19SNick Piggin static void do_sigbus(struct pt_regs *regs,
59192181f19SNick Piggin 			unsigned long error_code, unsigned long address)
59292181f19SNick Piggin {
59392181f19SNick Piggin 	struct task_struct *tsk = current;
59492181f19SNick Piggin 	struct mm_struct *mm = tsk->mm;
59592181f19SNick Piggin 
59692181f19SNick Piggin 	up_read(&mm->mmap_sem);
59792181f19SNick Piggin 
59892181f19SNick Piggin 	/* Kernel mode? Handle exceptions or die */
59992181f19SNick Piggin 	if (!(error_code & PF_USER))
60092181f19SNick Piggin 		no_context(regs, error_code, address);
60192181f19SNick Piggin #ifdef CONFIG_X86_32
60292181f19SNick Piggin 	/* User space => ok to do another page fault */
60392181f19SNick Piggin 	if (is_prefetch(regs, error_code, address))
60492181f19SNick Piggin 		return;
60592181f19SNick Piggin #endif
60692181f19SNick Piggin 	tsk->thread.cr2 = address;
60792181f19SNick Piggin 	tsk->thread.error_code = error_code;
60892181f19SNick Piggin 	tsk->thread.trap_no = 14;
60992181f19SNick Piggin 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
61092181f19SNick Piggin }
61192181f19SNick Piggin 
61292181f19SNick Piggin static noinline void mm_fault_error(struct pt_regs *regs,
61392181f19SNick Piggin 		unsigned long error_code, unsigned long address, unsigned int fault)
61492181f19SNick Piggin {
61592181f19SNick Piggin 	if (fault & VM_FAULT_OOM)
61692181f19SNick Piggin 		out_of_memory(regs, error_code, address);
61792181f19SNick Piggin 	else if (fault & VM_FAULT_SIGBUS)
61892181f19SNick Piggin 		do_sigbus(regs, error_code, address);
61992181f19SNick Piggin 	else
62092181f19SNick Piggin 		BUG();
62192181f19SNick Piggin }
62292181f19SNick Piggin 
623d8b57bb7SThomas Gleixner static int spurious_fault_check(unsigned long error_code, pte_t *pte)
624d8b57bb7SThomas Gleixner {
625d8b57bb7SThomas Gleixner 	if ((error_code & PF_WRITE) && !pte_write(*pte))
626d8b57bb7SThomas Gleixner 		return 0;
627d8b57bb7SThomas Gleixner 	if ((error_code & PF_INSTR) && !pte_exec(*pte))
628d8b57bb7SThomas Gleixner 		return 0;
629d8b57bb7SThomas Gleixner 
630d8b57bb7SThomas Gleixner 	return 1;
631d8b57bb7SThomas Gleixner }
632d8b57bb7SThomas Gleixner 
/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Walks the kernel reference page tables (init_mm.pgd) for @address
 * and returns 1 if the current entries already permit the access
 * described by @error_code (i.e. the fault is spurious), 0 otherwise.
 * Large (2M/1G) mappings are checked at the pud/pmd level directly.
 */
static noinline int spurious_fault(unsigned long error_code,
				unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	/* Walk the kernel 'reference' page table, not the current mm's: */
	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	/* 1G page: the pud itself is the leaf entry to check. */
	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	/* 2M/4M page: the pmd itself is the leaf entry to check. */
	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}
6785b727a3bSJeremy Fitzhardinge 
/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 *
 * Returns 0 if the fault was resolved by syncing the task's page
 * tables with the kernel reference tables, -1 if the address is not
 * a live vmalloc mapping (caller treats it as a real fault).
 */
static noinline int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();	/* physical addr of the active pgd */
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the later
	   case just flush. */

	/* Again avoid "current": use the mm the CPU is actually running. */
	pgd = pgd_offset(current->active_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}
764c61e211dSHarvey Harrison 
765c61e211dSHarvey Harrison int show_unhandled_signals = 1;
766c61e211dSHarvey Harrison 
76792181f19SNick Piggin static inline int access_error(unsigned long error_code, int write,
76892181f19SNick Piggin 				struct vm_area_struct *vma)
76992181f19SNick Piggin {
77092181f19SNick Piggin 	if (write) {
77192181f19SNick Piggin 		/* write, present and write, not present */
77292181f19SNick Piggin 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
77392181f19SNick Piggin 			return 1;
77492181f19SNick Piggin 	} else if (unlikely(error_code & PF_PROT)) {
77592181f19SNick Piggin 		/* read, present */
77692181f19SNick Piggin 		return 1;
77792181f19SNick Piggin 	} else {
77892181f19SNick Piggin 		/* read, not present */
77992181f19SNick Piggin 		if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
78092181f19SNick Piggin 			return 1;
78192181f19SNick Piggin 	}
78292181f19SNick Piggin 
78392181f19SNick Piggin 	return 0;
78492181f19SNick Piggin }
78592181f19SNick Piggin 
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * @regs:       register state at the time of the fault
 * @error_code: hardware #PF error code (PF_* bits: present/write/
 *              user/reserved/instruction-fetch)
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address;
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int write;
	int fault;

	tsk = current;
	mm = tsk->mm;
	/* Warm up the semaphore cacheline; we will almost always take it. */
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	/* kprobes or mmiotrace may consume the fault entirely: */
	if (unlikely(notify_page_fault(regs)))
		return;
	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(error_code, address))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
	} else if (regs->flags & X86_EFLAGS_IF)
		local_irq_enable();

#ifdef CONFIG_X86_64
	/* A reserved-bit violation means a corrupted page table: */
	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);
#endif

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	/* Address below the vma: only valid for a growing stack vma. */
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work.  ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	write = error_code & PF_WRITE;
	if (unlikely(access_error(error_code, write, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		/* NOTE: mm_fault_error()'s handlers drop mmap_sem themselves. */
		mm_fault_error(regs, error_code, address, fault);
		return;
	}
	/* Account the fault against the task: */
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
}
967c61e211dSHarvey Harrison 
/* Protects pgd_list, the list of all pgd pages (see vmalloc_sync_all()). */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
970c61e211dSHarvey Harrison 
/*
 * Eagerly propagate the kernel's vmalloc-area page-table entries into
 * every pgd on pgd_list, so later accesses cannot vmalloc_fault().
 *
 * 32-bit: sync at pmd granularity via vmalloc_sync_one(); a no-op when
 * the kernel pmd is shared (SHARED_KERNEL_PMD).
 * 64-bit: copy missing pgd entries from the reference table, and
 * BUG on any mismatching ones.
 */
void vmalloc_sync_all(void)
{
	unsigned long address;

#ifdef CONFIG_X86_32
	if (SHARED_KERNEL_PMD)
		return;

	for (address = VMALLOC_START & PMD_MASK;
	     address >= TASK_SIZE && address < FIXADDR_TOP;
	     address += PMD_SIZE) {
		unsigned long flags;
		struct page *page;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			/* vmalloc_sync_one() returning NULL stops this address: */
			if (!vmalloc_sync_one(page_address(page),
					      address))
				break;
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#else /* CONFIG_X86_64 */
	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
	     address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		unsigned long flags;
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;
		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				/* Shared lower tables: entries must agree. */
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#endif
}
1015