xref: /openbmc/linux/arch/x86/mm/fault.c (revision 010060741ad35eacb504414bc6fb9bb575b15f62)
1c61e211dSHarvey Harrison /*
2c61e211dSHarvey Harrison  *  Copyright (C) 1995  Linus Torvalds
3c61e211dSHarvey Harrison  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4c61e211dSHarvey Harrison  */
5c61e211dSHarvey Harrison 
6c61e211dSHarvey Harrison #include <linux/signal.h>
7c61e211dSHarvey Harrison #include <linux/sched.h>
8c61e211dSHarvey Harrison #include <linux/kernel.h>
9c61e211dSHarvey Harrison #include <linux/errno.h>
10c61e211dSHarvey Harrison #include <linux/string.h>
11c61e211dSHarvey Harrison #include <linux/types.h>
12c61e211dSHarvey Harrison #include <linux/ptrace.h>
130fd0e3daSPekka Paalanen #include <linux/mmiotrace.h>
14c61e211dSHarvey Harrison #include <linux/mman.h>
15c61e211dSHarvey Harrison #include <linux/mm.h>
16c61e211dSHarvey Harrison #include <linux/smp.h>
17c61e211dSHarvey Harrison #include <linux/interrupt.h>
18c61e211dSHarvey Harrison #include <linux/init.h>
19c61e211dSHarvey Harrison #include <linux/tty.h>
20c61e211dSHarvey Harrison #include <linux/vt_kern.h>		/* For unblank_screen() */
21c61e211dSHarvey Harrison #include <linux/compiler.h>
22c61e211dSHarvey Harrison #include <linux/highmem.h>
23c61e211dSHarvey Harrison #include <linux/bootmem.h>		/* for max_low_pfn */
24c61e211dSHarvey Harrison #include <linux/vmalloc.h>
25c61e211dSHarvey Harrison #include <linux/module.h>
26c61e211dSHarvey Harrison #include <linux/kprobes.h>
27c61e211dSHarvey Harrison #include <linux/uaccess.h>
28c61e211dSHarvey Harrison #include <linux/kdebug.h>
29c61e211dSHarvey Harrison 
30c61e211dSHarvey Harrison #include <asm/system.h>
31c61e211dSHarvey Harrison #include <asm/desc.h>
32c61e211dSHarvey Harrison #include <asm/segment.h>
33c61e211dSHarvey Harrison #include <asm/pgalloc.h>
34c61e211dSHarvey Harrison #include <asm/smp.h>
35c61e211dSHarvey Harrison #include <asm/tlbflush.h>
36c61e211dSHarvey Harrison #include <asm/proto.h>
37c61e211dSHarvey Harrison #include <asm-generic/sections.h>
3870ef5641SJaswinder Singh #include <asm/traps.h>
39c61e211dSHarvey Harrison 
/*
 * Page fault error code bits, as tested against error_code in this file:
 *	bit 0 (PF_PROT)  == 0 means no page found, 1 means protection fault
 *	bit 1 (PF_WRITE) == 0 means read, 1 means write
 *	bit 2 (PF_USER)  == 0 means kernel, 1 means user-mode
 *	bit 3 (PF_RSVD)  == 1 means use of reserved bit detected
 *	bit 4 (PF_INSTR) == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
53c61e211dSHarvey Harrison 
/*
 * Offer the fault to the mmiotrace (kmmio) handler.
 *
 * Returns -1 when kmmio is active and kmmio_handler() returns 1 for this
 * fault; returns 0 in every other case, including kernels built without
 * CONFIG_MMIOTRACE.
 */
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE
	if (unlikely(is_kmmio_active()) && kmmio_handler(regs, addr) == 1)
		return -1;
#endif
	return 0;
}
6386069782SPekka Paalanen 
/*
 * Let a running kprobe claim a kernel-mode page fault (trap 14).
 *
 * Returns 1 when the fault did not come from user mode, a kprobe is
 * running and kprobe_fault_handler() returned non-zero; otherwise 0.
 * Always 0 without CONFIG_KPROBES.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
	int claimed = 0;

#ifdef CONFIG_KPROBES
	if (!user_mode_vm(regs)) {
		/* kprobe_running() needs smp_processor_id(), so no preemption */
		preempt_disable();
		claimed = kprobe_running() && kprobe_fault_handler(regs, 14);
		preempt_enable();
	}
#endif

	return claimed;
}
82c61e211dSHarvey Harrison 
83c61e211dSHarvey Harrison /*
84c61e211dSHarvey Harrison  * X86_32
85c61e211dSHarvey Harrison  * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
86c61e211dSHarvey Harrison  * Check that here and ignore it.
87c61e211dSHarvey Harrison  *
88c61e211dSHarvey Harrison  * X86_64
89c61e211dSHarvey Harrison  * Sometimes the CPU reports invalid exceptions on prefetch.
90c61e211dSHarvey Harrison  * Check that here and ignore it.
91c61e211dSHarvey Harrison  *
92c61e211dSHarvey Harrison  * Opcode checker based on code by Richard Brunner
93c61e211dSHarvey Harrison  */
9492181f19SNick Piggin static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
9592181f19SNick Piggin 			unsigned long addr)
96c61e211dSHarvey Harrison {
97c61e211dSHarvey Harrison 	unsigned char *instr;
98c61e211dSHarvey Harrison 	int scan_more = 1;
99c61e211dSHarvey Harrison 	int prefetch = 0;
100c61e211dSHarvey Harrison 	unsigned char *max_instr;
101c61e211dSHarvey Harrison 
1023085354dSIngo Molnar 	/*
1033085354dSIngo Molnar 	 * If it was a exec (instruction fetch) fault on NX page, then
1043085354dSIngo Molnar 	 * do not ignore the fault:
1053085354dSIngo Molnar 	 */
106c61e211dSHarvey Harrison 	if (error_code & PF_INSTR)
107c61e211dSHarvey Harrison 		return 0;
108c61e211dSHarvey Harrison 
109c61e211dSHarvey Harrison 	instr = (unsigned char *)convert_ip_to_linear(current, regs);
110c61e211dSHarvey Harrison 	max_instr = instr + 15;
111c61e211dSHarvey Harrison 
112c61e211dSHarvey Harrison 	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
113c61e211dSHarvey Harrison 		return 0;
114c61e211dSHarvey Harrison 
115c61e211dSHarvey Harrison 	while (scan_more && instr < max_instr) {
116c61e211dSHarvey Harrison 		unsigned char opcode;
117c61e211dSHarvey Harrison 		unsigned char instr_hi;
118c61e211dSHarvey Harrison 		unsigned char instr_lo;
119c61e211dSHarvey Harrison 
120c61e211dSHarvey Harrison 		if (probe_kernel_address(instr, opcode))
121c61e211dSHarvey Harrison 			break;
122c61e211dSHarvey Harrison 
123c61e211dSHarvey Harrison 		instr_hi = opcode & 0xf0;
124c61e211dSHarvey Harrison 		instr_lo = opcode & 0x0f;
125c61e211dSHarvey Harrison 		instr++;
126c61e211dSHarvey Harrison 
127c61e211dSHarvey Harrison 		switch (instr_hi) {
128c61e211dSHarvey Harrison 		case 0x20:
129c61e211dSHarvey Harrison 		case 0x30:
130c61e211dSHarvey Harrison 			/*
131c61e211dSHarvey Harrison 			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
132c61e211dSHarvey Harrison 			 * In X86_64 long mode, the CPU will signal invalid
133c61e211dSHarvey Harrison 			 * opcode if some of these prefixes are present so
134c61e211dSHarvey Harrison 			 * X86_64 will never get here anyway
135c61e211dSHarvey Harrison 			 */
136c61e211dSHarvey Harrison 			scan_more = ((instr_lo & 7) == 0x6);
137c61e211dSHarvey Harrison 			break;
138c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
139c61e211dSHarvey Harrison 		case 0x40:
140c61e211dSHarvey Harrison 			/*
141c61e211dSHarvey Harrison 			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
142c61e211dSHarvey Harrison 			 * Need to figure out under what instruction mode the
143c61e211dSHarvey Harrison 			 * instruction was issued. Could check the LDT for lm,
144c61e211dSHarvey Harrison 			 * but for now it's good enough to assume that long
145c61e211dSHarvey Harrison 			 * mode only uses well known segments or kernel.
146c61e211dSHarvey Harrison 			 */
147c61e211dSHarvey Harrison 			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
148c61e211dSHarvey Harrison 			break;
149c61e211dSHarvey Harrison #endif
150c61e211dSHarvey Harrison 		case 0x60:
151c61e211dSHarvey Harrison 			/* 0x64 thru 0x67 are valid prefixes in all modes. */
152c61e211dSHarvey Harrison 			scan_more = (instr_lo & 0xC) == 0x4;
153c61e211dSHarvey Harrison 			break;
154c61e211dSHarvey Harrison 		case 0xF0:
155c61e211dSHarvey Harrison 			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
156c61e211dSHarvey Harrison 			scan_more = !instr_lo || (instr_lo>>1) == 1;
157c61e211dSHarvey Harrison 			break;
158c61e211dSHarvey Harrison 		case 0x00:
159c61e211dSHarvey Harrison 			/* Prefetch instruction is 0x0F0D or 0x0F18 */
160c61e211dSHarvey Harrison 			scan_more = 0;
161c61e211dSHarvey Harrison 
162c61e211dSHarvey Harrison 			if (probe_kernel_address(instr, opcode))
163c61e211dSHarvey Harrison 				break;
164c61e211dSHarvey Harrison 			prefetch = (instr_lo == 0xF) &&
165c61e211dSHarvey Harrison 				(opcode == 0x0D || opcode == 0x18);
166c61e211dSHarvey Harrison 			break;
167c61e211dSHarvey Harrison 		default:
168c61e211dSHarvey Harrison 			scan_more = 0;
169c61e211dSHarvey Harrison 			break;
170c61e211dSHarvey Harrison 		}
171c61e211dSHarvey Harrison 	}
172c61e211dSHarvey Harrison 	return prefetch;
173c61e211dSHarvey Harrison }
174c61e211dSHarvey Harrison 
175c61e211dSHarvey Harrison static void force_sig_info_fault(int si_signo, int si_code,
176c61e211dSHarvey Harrison 	unsigned long address, struct task_struct *tsk)
177c61e211dSHarvey Harrison {
178c61e211dSHarvey Harrison 	siginfo_t info;
179c61e211dSHarvey Harrison 
180c61e211dSHarvey Harrison 	info.si_signo = si_signo;
181c61e211dSHarvey Harrison 	info.si_errno = 0;
182c61e211dSHarvey Harrison 	info.si_code = si_code;
183c61e211dSHarvey Harrison 	info.si_addr = (void __user *)address;
184c61e211dSHarvey Harrison 	force_sig_info(si_signo, &info, tsk);
185c61e211dSHarvey Harrison }
186c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * Probe an unsigned long read at @p with probe_kernel_address() and
 * return its result; callers treat a non-zero value as "do not
 * dereference this pointer".
 */
static int bad_address(void *p)
{
	unsigned long *slot = (unsigned long *)p;
	unsigned long scratch;

	return probe_kernel_address(slot, scratch);
}
#endif
194c61e211dSHarvey Harrison 
/*
 * Print the page-table entries covering @address, one level per printk,
 * starting from the table that CR3 currently points at.  The walk stops
 * early at a non-present or large entry.
 *
 * X86_32: prints "*pdpt"/"*pde" (PAE) or "*pde" (non-PAE), then "*pte"
 * when the lower-level table can be reached through the low-memory
 * mapping.
 * X86_64: prints PGD/PUD/PMD/PTE; every table pointer is checked with
 * bad_address() first and "BAD" is printed if that check fails.
 */
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	/* Same integer type as a raw pte value for this configuration. */
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	/* Top-level entry for @address. */
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	/* Only descend if the next table is present and below max_low_pfn. */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
		                                         & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		/* Drop the NX bit so the pfn comparison below is not skewed. */
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
		                                         & (PTRS_PER_PTE - 1)];
		/* Field width is two hex digits per byte of the pte type. */
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	/* Turn the physical table address from CR3 into a kernel pointer. */
	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	/* A large PUD has no lower levels to print. */
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	/* Likewise for a large PMD. */
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}
267c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_32
/*
 * Copy the pmd entry covering @address from the reference page table
 * (init_mm.pgd) into the page table rooted at @pgd.
 *
 * Returns the reference pmd on success, or NULL when the reference table
 * has no present pgd/pud/pmd entry for @address.  If the target pmd is
 * already present it must point at the same page as the reference one,
 * otherwise BUG_ON() fires.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned idx = pgd_index(address);
	pgd_t *ref_pgd = init_mm.pgd + idx;
	pud_t *pud, *ref_pud;
	pmd_t *pmd, *ref_pmd;

	pgd += idx;

	if (!pgd_present(*ref_pgd))
		return NULL;

	/*
	 * No set_pgd(pgd, *ref_pgd) here: it would be useless on PAE and
	 * redundant with the set_pmd() below on non-PAE.  The same holds
	 * for set_pud.
	 */
	pud = pud_offset(pgd, address);
	ref_pud = pud_offset(ref_pgd, address);
	if (!pud_present(*ref_pud))
		return NULL;

	pmd = pmd_offset(pud, address);
	ref_pmd = pmd_offset(ref_pud, address);
	if (!pmd_present(*ref_pmd))
		return NULL;

	if (pmd_present(*pmd)) {
		/* Already populated: it has to agree with the reference. */
		BUG_ON(pmd_page(*pmd) != pmd_page(*ref_pmd));
		return ref_pmd;
	}

	set_pmd(pmd, *ref_pmd);
	arch_flush_lazy_mmu_mode();
	return ref_pmd;
}
#endif
305c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * Warning printed by is_errata93() the first time it applies the K8
 * erratum #93 workaround.  Each line carries its own KERN_ERR prefix.
 */
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif
313c61e211dSHarvey Harrison 
/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround to avoid
 * corruption of the 64bit RIP register on C stepping K8.  A lot of BIOSes
 * that didn't get tested properly miss this.  The OS sees this as a page
 * fault with the upper 32 bits of RIP cleared.  Try to work around it
 * here.
 *
 * If @address equals regs->ip, has its upper 32 bits clear, and setting
 * those bits lands inside the kernel text or the module area, regs->ip is
 * rewritten to the repaired address and 1 is returned (after a one-time
 * warning).  Otherwise 0 is returned.
 *
 * Note we only handle faults in kernel here.
 * Does nothing for X86_32.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	unsigned long fixed_ip;

	if (address != regs->ip || (address >> 32) != 0)
		return 0;

	/* Restore the upper half that the erratum cleared. */
	fixed_ip = address | (0xffffffffUL << 32);

	if (!((fixed_ip >= (u64)_stext && fixed_ip <= (u64)_etext) ||
	      (fixed_ip >= MODULES_VADDR && fixed_ip <= MODULES_END)))
		return 0;

	if (!warned) {
		printk(errata93_warning);
		warned = 1;
	}
	regs->ip = fixed_ip;
	return 1;
#else
	return 0;
#endif
}
344c61e211dSHarvey Harrison 
/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB.  We catch this in the page fault handler because
 * these addresses are not reachable.  Just detect this case and return.
 * Any code segment in the LDT is compatibility mode.
 *
 * Returns 1 when @address has bits set above bit 31 and regs->cs is
 * either __USER32_CS or has bit 2 set; 0 otherwise (always 0 on X86_32).
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	int compat_cs = (regs->cs == __USER32_CS) || (regs->cs & (1<<2));

	if (compat_cs && (address >> 32))
		return 1;
#endif
	return 0;
}
360c61e211dSHarvey Harrison 
/*
 * Pentium F0 0F C7 C8 bug workaround.
 *
 * On CPUs flagged with f00f_bug, a fault whose address falls in IDT
 * slot 6 (8 bytes per slot, counted from idt_descr.address) is handed to
 * do_invalid_op() and 1 is returned.  Returns 0 otherwise, and always 0
 * without CONFIG_X86_F00F_BUG.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_data.f00f_bug &&
	    ((address - idt_descr.address) >> 3) == 6) {
		do_invalid_op(regs, 0);
		return 1;
	}
#endif
	return 0;
}
379c61e211dSHarvey Harrison 
380c61e211dSHarvey Harrison static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
381c61e211dSHarvey Harrison 			    unsigned long address)
382c61e211dSHarvey Harrison {
383c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
384c61e211dSHarvey Harrison 	if (!oops_may_print())
385c61e211dSHarvey Harrison 		return;
386fd40d6e3SHarvey Harrison #endif
387c61e211dSHarvey Harrison 
388c61e211dSHarvey Harrison #ifdef CONFIG_X86_PAE
389c61e211dSHarvey Harrison 	if (error_code & PF_INSTR) {
39093809be8SHarvey Harrison 		unsigned int level;
391c61e211dSHarvey Harrison 		pte_t *pte = lookup_address(address, &level);
392c61e211dSHarvey Harrison 
393c61e211dSHarvey Harrison 		if (pte && pte_present(*pte) && !pte_exec(*pte))
394c61e211dSHarvey Harrison 			printk(KERN_CRIT "kernel tried to execute "
395c61e211dSHarvey Harrison 				"NX-protected page - exploit attempt? "
396350b4da7SDavid Howells 				"(uid: %d)\n", current_uid());
397c61e211dSHarvey Harrison 	}
398c61e211dSHarvey Harrison #endif
399fd40d6e3SHarvey Harrison 
400c61e211dSHarvey Harrison 	printk(KERN_ALERT "BUG: unable to handle kernel ");
401c61e211dSHarvey Harrison 	if (address < PAGE_SIZE)
402c61e211dSHarvey Harrison 		printk(KERN_CONT "NULL pointer dereference");
403c61e211dSHarvey Harrison 	else
404c61e211dSHarvey Harrison 		printk(KERN_CONT "paging request");
405f294a8ceSVegard Nossum 	printk(KERN_CONT " at %p\n", (void *) address);
406c61e211dSHarvey Harrison 	printk(KERN_ALERT "IP:");
407c61e211dSHarvey Harrison 	printk_address(regs->ip, 1);
408c61e211dSHarvey Harrison 	dump_pagetable(address);
409c61e211dSHarvey Harrison }
410c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * Report a corrupted page table for @address: print the message and the
 * page-table dump, record the fault in current's thread state, and oops
 * with "Bad pagetable".  oops_end() gets SIGKILL unless __die() returns
 * non-zero, in which case it gets 0.
 */
static noinline void pgtable_bad(struct pt_regs *regs,
			 unsigned long error_code, unsigned long address)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk = current;
	int sig;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

	sig = __die("Bad pagetable", regs, error_code) ? 0 : SIGKILL;
	oops_end(flags, regs, sig);
}
#endif
430c61e211dSHarvey Harrison 
43192181f19SNick Piggin static noinline void no_context(struct pt_regs *regs,
43292181f19SNick Piggin 			unsigned long error_code, unsigned long address)
43392181f19SNick Piggin {
43492181f19SNick Piggin 	struct task_struct *tsk = current;
43592181f19SNick Piggin #ifdef CONFIG_X86_64
43692181f19SNick Piggin 	unsigned long flags;
43792181f19SNick Piggin 	int sig;
43892181f19SNick Piggin #endif
43992181f19SNick Piggin 
44092181f19SNick Piggin 	/* Are we prepared to handle this kernel fault?  */
44192181f19SNick Piggin 	if (fixup_exception(regs))
44292181f19SNick Piggin 		return;
44392181f19SNick Piggin 
44492181f19SNick Piggin 	/*
44592181f19SNick Piggin 	 * X86_32
44692181f19SNick Piggin 	 * Valid to do another page fault here, because if this fault
44792181f19SNick Piggin 	 * had been triggered by is_prefetch fixup_exception would have
44892181f19SNick Piggin 	 * handled it.
44992181f19SNick Piggin 	 *
45092181f19SNick Piggin 	 * X86_64
45192181f19SNick Piggin 	 * Hall of shame of CPU/BIOS bugs.
45292181f19SNick Piggin 	 */
45392181f19SNick Piggin 	if (is_prefetch(regs, error_code, address))
45492181f19SNick Piggin 		return;
45592181f19SNick Piggin 
45692181f19SNick Piggin 	if (is_errata93(regs, address))
45792181f19SNick Piggin 		return;
45892181f19SNick Piggin 
45992181f19SNick Piggin 	/*
46092181f19SNick Piggin 	 * Oops. The kernel tried to access some bad page. We'll have to
46192181f19SNick Piggin 	 * terminate things with extreme prejudice.
46292181f19SNick Piggin 	 */
46392181f19SNick Piggin #ifdef CONFIG_X86_32
46492181f19SNick Piggin 	bust_spinlocks(1);
46592181f19SNick Piggin #else
46692181f19SNick Piggin 	flags = oops_begin();
46792181f19SNick Piggin #endif
46892181f19SNick Piggin 
46992181f19SNick Piggin 	show_fault_oops(regs, error_code, address);
47092181f19SNick Piggin 
47192181f19SNick Piggin 	tsk->thread.cr2 = address;
47292181f19SNick Piggin 	tsk->thread.trap_no = 14;
47392181f19SNick Piggin 	tsk->thread.error_code = error_code;
47492181f19SNick Piggin 
47592181f19SNick Piggin #ifdef CONFIG_X86_32
47692181f19SNick Piggin 	die("Oops", regs, error_code);
47792181f19SNick Piggin 	bust_spinlocks(0);
47892181f19SNick Piggin 	do_exit(SIGKILL);
47992181f19SNick Piggin #else
48092181f19SNick Piggin 	sig = SIGKILL;
48192181f19SNick Piggin 	if (__die("Oops", regs, error_code))
48292181f19SNick Piggin 		sig = 0;
48392181f19SNick Piggin 	/* Executive summary in case the body of the oops scrolled away */
48492181f19SNick Piggin 	printk(KERN_EMERG "CR2: %016lx\n", address);
48592181f19SNick Piggin 	oops_end(flags, regs, sig);
48692181f19SNick Piggin #endif
48792181f19SNick Piggin }
48892181f19SNick Piggin 
48992181f19SNick Piggin static void __bad_area_nosemaphore(struct pt_regs *regs,
49092181f19SNick Piggin 			unsigned long error_code, unsigned long address,
49192181f19SNick Piggin 			int si_code)
49292181f19SNick Piggin {
49392181f19SNick Piggin 	struct task_struct *tsk = current;
49492181f19SNick Piggin 
49592181f19SNick Piggin 	/* User mode accesses just cause a SIGSEGV */
49692181f19SNick Piggin 	if (error_code & PF_USER) {
49792181f19SNick Piggin 		/*
49892181f19SNick Piggin 		 * It's possible to have interrupts off here.
49992181f19SNick Piggin 		 */
50092181f19SNick Piggin 		local_irq_enable();
50192181f19SNick Piggin 
50292181f19SNick Piggin 		/*
50392181f19SNick Piggin 		 * Valid to do another page fault here because this one came
50492181f19SNick Piggin 		 * from user space.
50592181f19SNick Piggin 		 */
50692181f19SNick Piggin 		if (is_prefetch(regs, error_code, address))
50792181f19SNick Piggin 			return;
50892181f19SNick Piggin 
50992181f19SNick Piggin 		if (is_errata100(regs, address))
51092181f19SNick Piggin 			return;
51192181f19SNick Piggin 
51292181f19SNick Piggin 		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
51392181f19SNick Piggin 		    printk_ratelimit()) {
51492181f19SNick Piggin 			printk(
51592181f19SNick Piggin 			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
51692181f19SNick Piggin 			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
51792181f19SNick Piggin 			tsk->comm, task_pid_nr(tsk), address,
51892181f19SNick Piggin 			(void *) regs->ip, (void *) regs->sp, error_code);
51992181f19SNick Piggin 			print_vma_addr(" in ", regs->ip);
52092181f19SNick Piggin 			printk("\n");
52192181f19SNick Piggin 		}
52292181f19SNick Piggin 
52392181f19SNick Piggin 		tsk->thread.cr2 = address;
52492181f19SNick Piggin 		/* Kernel addresses are always protection faults */
52592181f19SNick Piggin 		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
52692181f19SNick Piggin 		tsk->thread.trap_no = 14;
52792181f19SNick Piggin 		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
52892181f19SNick Piggin 		return;
52992181f19SNick Piggin 	}
53092181f19SNick Piggin 
53192181f19SNick Piggin 	if (is_f00f_bug(regs, address))
53292181f19SNick Piggin 		return;
53392181f19SNick Piggin 
53492181f19SNick Piggin 	no_context(regs, error_code, address);
53592181f19SNick Piggin }
53692181f19SNick Piggin 
/*
 * Bad-address handling with si_code SEGV_MAPERR.  Unlike bad_area(), this
 * variant does not release mmap_sem.
 */
static noinline void bad_area_nosemaphore(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}
54292181f19SNick Piggin 
54392181f19SNick Piggin static void __bad_area(struct pt_regs *regs,
54492181f19SNick Piggin 			unsigned long error_code, unsigned long address,
54592181f19SNick Piggin 			int si_code)
54692181f19SNick Piggin {
54792181f19SNick Piggin 	struct mm_struct *mm = current->mm;
54892181f19SNick Piggin 
54992181f19SNick Piggin 	/*
55092181f19SNick Piggin 	 * Something tried to access memory that isn't in our memory map..
55192181f19SNick Piggin 	 * Fix it, but check if it's kernel or user first..
55292181f19SNick Piggin 	 */
55392181f19SNick Piggin 	up_read(&mm->mmap_sem);
55492181f19SNick Piggin 
55592181f19SNick Piggin 	__bad_area_nosemaphore(regs, error_code, address, si_code);
55692181f19SNick Piggin }
55792181f19SNick Piggin 
/*
 * Bad-address handling with si_code SEGV_MAPERR; releases mmap_sem via
 * __bad_area().
 */
static noinline void bad_area(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_MAPERR);
}
56392181f19SNick Piggin 
/*
 * Bad-address handling with si_code SEGV_ACCERR; releases mmap_sem via
 * __bad_area().
 */
static noinline void bad_area_access_error(struct pt_regs *regs,
			unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_ACCERR);
}
56992181f19SNick Piggin 
57092181f19SNick Piggin /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
57192181f19SNick Piggin static void out_of_memory(struct pt_regs *regs,
57292181f19SNick Piggin 			unsigned long error_code, unsigned long address)
57392181f19SNick Piggin {
57492181f19SNick Piggin 	/*
57592181f19SNick Piggin 	 * We ran out of memory, call the OOM killer, and return the userspace
57692181f19SNick Piggin 	 * (which will retry the fault, or kill us if we got oom-killed).
57792181f19SNick Piggin 	 */
57892181f19SNick Piggin 	up_read(&current->mm->mmap_sem);
57992181f19SNick Piggin 	pagefault_out_of_memory();
58092181f19SNick Piggin }
58192181f19SNick Piggin 
58292181f19SNick Piggin static void do_sigbus(struct pt_regs *regs,
58392181f19SNick Piggin 			unsigned long error_code, unsigned long address)
58492181f19SNick Piggin {
58592181f19SNick Piggin 	struct task_struct *tsk = current;
58692181f19SNick Piggin 	struct mm_struct *mm = tsk->mm;
58792181f19SNick Piggin 
58892181f19SNick Piggin 	up_read(&mm->mmap_sem);
58992181f19SNick Piggin 
59092181f19SNick Piggin 	/* Kernel mode? Handle exceptions or die */
59192181f19SNick Piggin 	if (!(error_code & PF_USER))
59292181f19SNick Piggin 		no_context(regs, error_code, address);
59392181f19SNick Piggin #ifdef CONFIG_X86_32
59492181f19SNick Piggin 	/* User space => ok to do another page fault */
59592181f19SNick Piggin 	if (is_prefetch(regs, error_code, address))
59692181f19SNick Piggin 		return;
59792181f19SNick Piggin #endif
59892181f19SNick Piggin 	tsk->thread.cr2 = address;
59992181f19SNick Piggin 	tsk->thread.error_code = error_code;
60092181f19SNick Piggin 	tsk->thread.trap_no = 14;
60192181f19SNick Piggin 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
60292181f19SNick Piggin }
60392181f19SNick Piggin 
60492181f19SNick Piggin static noinline void mm_fault_error(struct pt_regs *regs,
60592181f19SNick Piggin 		unsigned long error_code, unsigned long address, unsigned int fault)
60692181f19SNick Piggin {
60792181f19SNick Piggin 	if (fault & VM_FAULT_OOM)
60892181f19SNick Piggin 		out_of_memory(regs, error_code, address);
60992181f19SNick Piggin 	else if (fault & VM_FAULT_SIGBUS)
61092181f19SNick Piggin 		do_sigbus(regs, error_code, address);
61192181f19SNick Piggin 	else
61292181f19SNick Piggin 		BUG();
61392181f19SNick Piggin }
61492181f19SNick Piggin 
615d8b57bb7SThomas Gleixner static int spurious_fault_check(unsigned long error_code, pte_t *pte)
616d8b57bb7SThomas Gleixner {
617d8b57bb7SThomas Gleixner 	if ((error_code & PF_WRITE) && !pte_write(*pte))
618d8b57bb7SThomas Gleixner 		return 0;
619d8b57bb7SThomas Gleixner 	if ((error_code & PF_INSTR) && !pte_exec(*pte))
620d8b57bb7SThomas Gleixner 		return 0;
621d8b57bb7SThomas Gleixner 
622d8b57bb7SThomas Gleixner 	return 1;
623d8b57bb7SThomas Gleixner }
624d8b57bb7SThomas Gleixner 
625c61e211dSHarvey Harrison /*
6265b727a3bSJeremy Fitzhardinge  * Handle a spurious fault caused by a stale TLB entry.  This allows
6275b727a3bSJeremy Fitzhardinge  * us to lazily refresh the TLB when increasing the permissions of a
6285b727a3bSJeremy Fitzhardinge  * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
6295b727a3bSJeremy Fitzhardinge  * expensive since that implies doing a full cross-processor TLB
6305b727a3bSJeremy Fitzhardinge  * flush, even if no stale TLB entries exist on other processors.
6315b727a3bSJeremy Fitzhardinge  * There are no security implications to leaving a stale TLB when
6325b727a3bSJeremy Fitzhardinge  * increasing the permissions on a page.
6335b727a3bSJeremy Fitzhardinge  */
63492181f19SNick Piggin static noinline int spurious_fault(unsigned long error_code,
63592181f19SNick Piggin 				unsigned long address)
6365b727a3bSJeremy Fitzhardinge {
6375b727a3bSJeremy Fitzhardinge 	pgd_t *pgd;
6385b727a3bSJeremy Fitzhardinge 	pud_t *pud;
6395b727a3bSJeremy Fitzhardinge 	pmd_t *pmd;
6405b727a3bSJeremy Fitzhardinge 	pte_t *pte;
6415b727a3bSJeremy Fitzhardinge 
6425b727a3bSJeremy Fitzhardinge 	/* Reserved-bit violation or user access to kernel space? */
6435b727a3bSJeremy Fitzhardinge 	if (error_code & (PF_USER | PF_RSVD))
6445b727a3bSJeremy Fitzhardinge 		return 0;
6455b727a3bSJeremy Fitzhardinge 
6465b727a3bSJeremy Fitzhardinge 	pgd = init_mm.pgd + pgd_index(address);
6475b727a3bSJeremy Fitzhardinge 	if (!pgd_present(*pgd))
6485b727a3bSJeremy Fitzhardinge 		return 0;
6495b727a3bSJeremy Fitzhardinge 
6505b727a3bSJeremy Fitzhardinge 	pud = pud_offset(pgd, address);
6515b727a3bSJeremy Fitzhardinge 	if (!pud_present(*pud))
6525b727a3bSJeremy Fitzhardinge 		return 0;
6535b727a3bSJeremy Fitzhardinge 
654d8b57bb7SThomas Gleixner 	if (pud_large(*pud))
655d8b57bb7SThomas Gleixner 		return spurious_fault_check(error_code, (pte_t *) pud);
656d8b57bb7SThomas Gleixner 
6575b727a3bSJeremy Fitzhardinge 	pmd = pmd_offset(pud, address);
6585b727a3bSJeremy Fitzhardinge 	if (!pmd_present(*pmd))
6595b727a3bSJeremy Fitzhardinge 		return 0;
6605b727a3bSJeremy Fitzhardinge 
661d8b57bb7SThomas Gleixner 	if (pmd_large(*pmd))
662d8b57bb7SThomas Gleixner 		return spurious_fault_check(error_code, (pte_t *) pmd);
663d8b57bb7SThomas Gleixner 
6645b727a3bSJeremy Fitzhardinge 	pte = pte_offset_kernel(pmd, address);
6655b727a3bSJeremy Fitzhardinge 	if (!pte_present(*pte))
6665b727a3bSJeremy Fitzhardinge 		return 0;
6675b727a3bSJeremy Fitzhardinge 
668d8b57bb7SThomas Gleixner 	return spurious_fault_check(error_code, pte);
6695b727a3bSJeremy Fitzhardinge }
6705b727a3bSJeremy Fitzhardinge 
6715b727a3bSJeremy Fitzhardinge /*
672c61e211dSHarvey Harrison  * X86_32
673c61e211dSHarvey Harrison  * Handle a fault on the vmalloc or module mapping area
674c61e211dSHarvey Harrison  *
675c61e211dSHarvey Harrison  * X86_64
676c61e211dSHarvey Harrison  * Handle a fault on the vmalloc area
677c61e211dSHarvey Harrison  *
678c61e211dSHarvey Harrison  * This assumes no large pages in there.
679c61e211dSHarvey Harrison  */
68092181f19SNick Piggin static noinline int vmalloc_fault(unsigned long address)
681c61e211dSHarvey Harrison {
682c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
683c61e211dSHarvey Harrison 	unsigned long pgd_paddr;
684c61e211dSHarvey Harrison 	pmd_t *pmd_k;
685c61e211dSHarvey Harrison 	pte_t *pte_k;
686b29c701dSHenry Nestler 
687b29c701dSHenry Nestler 	/* Make sure we are in vmalloc area */
688b29c701dSHenry Nestler 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
689b29c701dSHenry Nestler 		return -1;
690b29c701dSHenry Nestler 
691c61e211dSHarvey Harrison 	/*
692c61e211dSHarvey Harrison 	 * Synchronize this task's top level page-table
693c61e211dSHarvey Harrison 	 * with the 'reference' page table.
694c61e211dSHarvey Harrison 	 *
695c61e211dSHarvey Harrison 	 * Do _not_ use "current" here. We might be inside
696c61e211dSHarvey Harrison 	 * an interrupt in the middle of a task switch..
697c61e211dSHarvey Harrison 	 */
698c61e211dSHarvey Harrison 	pgd_paddr = read_cr3();
699c61e211dSHarvey Harrison 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
700c61e211dSHarvey Harrison 	if (!pmd_k)
701c61e211dSHarvey Harrison 		return -1;
702c61e211dSHarvey Harrison 	pte_k = pte_offset_kernel(pmd_k, address);
703c61e211dSHarvey Harrison 	if (!pte_present(*pte_k))
704c61e211dSHarvey Harrison 		return -1;
705c61e211dSHarvey Harrison 	return 0;
706c61e211dSHarvey Harrison #else
707c61e211dSHarvey Harrison 	pgd_t *pgd, *pgd_ref;
708c61e211dSHarvey Harrison 	pud_t *pud, *pud_ref;
709c61e211dSHarvey Harrison 	pmd_t *pmd, *pmd_ref;
710c61e211dSHarvey Harrison 	pte_t *pte, *pte_ref;
711c61e211dSHarvey Harrison 
712cf89ec92SHarvey Harrison 	/* Make sure we are in vmalloc area */
713cf89ec92SHarvey Harrison 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
714cf89ec92SHarvey Harrison 		return -1;
715cf89ec92SHarvey Harrison 
716c61e211dSHarvey Harrison 	/* Copy kernel mappings over when needed. This can also
717c61e211dSHarvey Harrison 	   happen within a race in page table update. In the later
718c61e211dSHarvey Harrison 	   case just flush. */
719c61e211dSHarvey Harrison 
720f313e123SAndi Kleen 	pgd = pgd_offset(current->active_mm, address);
721c61e211dSHarvey Harrison 	pgd_ref = pgd_offset_k(address);
722c61e211dSHarvey Harrison 	if (pgd_none(*pgd_ref))
723c61e211dSHarvey Harrison 		return -1;
724c61e211dSHarvey Harrison 	if (pgd_none(*pgd))
725c61e211dSHarvey Harrison 		set_pgd(pgd, *pgd_ref);
726c61e211dSHarvey Harrison 	else
727c61e211dSHarvey Harrison 		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
728c61e211dSHarvey Harrison 
729c61e211dSHarvey Harrison 	/* Below here mismatches are bugs because these lower tables
730c61e211dSHarvey Harrison 	   are shared */
731c61e211dSHarvey Harrison 
732c61e211dSHarvey Harrison 	pud = pud_offset(pgd, address);
733c61e211dSHarvey Harrison 	pud_ref = pud_offset(pgd_ref, address);
734c61e211dSHarvey Harrison 	if (pud_none(*pud_ref))
735c61e211dSHarvey Harrison 		return -1;
736c61e211dSHarvey Harrison 	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
737c61e211dSHarvey Harrison 		BUG();
738c61e211dSHarvey Harrison 	pmd = pmd_offset(pud, address);
739c61e211dSHarvey Harrison 	pmd_ref = pmd_offset(pud_ref, address);
740c61e211dSHarvey Harrison 	if (pmd_none(*pmd_ref))
741c61e211dSHarvey Harrison 		return -1;
742c61e211dSHarvey Harrison 	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
743c61e211dSHarvey Harrison 		BUG();
744c61e211dSHarvey Harrison 	pte_ref = pte_offset_kernel(pmd_ref, address);
745c61e211dSHarvey Harrison 	if (!pte_present(*pte_ref))
746c61e211dSHarvey Harrison 		return -1;
747c61e211dSHarvey Harrison 	pte = pte_offset_kernel(pmd, address);
748c61e211dSHarvey Harrison 	/* Don't use pte_page here, because the mappings can point
749c61e211dSHarvey Harrison 	   outside mem_map, and the NUMA hash lookup cannot handle
750c61e211dSHarvey Harrison 	   that. */
751c61e211dSHarvey Harrison 	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
752c61e211dSHarvey Harrison 		BUG();
753c61e211dSHarvey Harrison 	return 0;
754c61e211dSHarvey Harrison #endif
755c61e211dSHarvey Harrison }
756c61e211dSHarvey Harrison 
/*
 * Global flag, initialised to 1.  It is not referenced in this part of the
 * file; presumably it controls whether unhandled signals are logged --
 * NOTE(review): confirm against its users.
 */
757c61e211dSHarvey Harrison int show_unhandled_signals = 1;
758c61e211dSHarvey Harrison 
75992181f19SNick Piggin static inline int access_error(unsigned long error_code, int write,
76092181f19SNick Piggin 				struct vm_area_struct *vma)
76192181f19SNick Piggin {
76292181f19SNick Piggin 	if (write) {
76392181f19SNick Piggin 		/* write, present and write, not present */
76492181f19SNick Piggin 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
76592181f19SNick Piggin 			return 1;
76692181f19SNick Piggin 	} else if (unlikely(error_code & PF_PROT)) {
76792181f19SNick Piggin 		/* read, present */
76892181f19SNick Piggin 		return 1;
76992181f19SNick Piggin 	} else {
77092181f19SNick Piggin 		/* read, not present */
77192181f19SNick Piggin 		if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
77292181f19SNick Piggin 			return 1;
77392181f19SNick Piggin 	}
77492181f19SNick Piggin 
77592181f19SNick Piggin 	return 0;
77692181f19SNick Piggin }
77792181f19SNick Piggin 
778c61e211dSHarvey Harrison /*
779c61e211dSHarvey Harrison  * This routine handles page faults.  It determines the address,
780c61e211dSHarvey Harrison  * and the problem, and then passes it off to one of the appropriate
781c61e211dSHarvey Harrison  * routines.
782c61e211dSHarvey Harrison  */
783c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
784c61e211dSHarvey Harrison asmlinkage
785c61e211dSHarvey Harrison #endif
786c61e211dSHarvey Harrison void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
787c61e211dSHarvey Harrison {
78892181f19SNick Piggin 	unsigned long address;
789c61e211dSHarvey Harrison 	struct task_struct *tsk;
790c61e211dSHarvey Harrison 	struct mm_struct *mm;
791c61e211dSHarvey Harrison 	struct vm_area_struct *vma;
79292181f19SNick Piggin 	int write;
793c61e211dSHarvey Harrison 	int fault;
794c61e211dSHarvey Harrison 
795c61e211dSHarvey Harrison 	tsk = current;
796c61e211dSHarvey Harrison 	mm = tsk->mm;
797c61e211dSHarvey Harrison 	prefetchw(&mm->mmap_sem);
798c61e211dSHarvey Harrison 
799c61e211dSHarvey Harrison 	/* get the address */
800c61e211dSHarvey Harrison 	address = read_cr2();
801c61e211dSHarvey Harrison 
80292181f19SNick Piggin 	if (unlikely(notify_page_fault(regs)))
803c61e211dSHarvey Harrison 		return;
8040fd0e3daSPekka Paalanen 	if (unlikely(kmmio_fault(regs, address)))
80586069782SPekka Paalanen 		return;
806c61e211dSHarvey Harrison 
807c61e211dSHarvey Harrison 	/*
808c61e211dSHarvey Harrison 	 * We fault-in kernel-space virtual memory on-demand. The
809c61e211dSHarvey Harrison 	 * 'reference' page table is init_mm.pgd.
810c61e211dSHarvey Harrison 	 *
811c61e211dSHarvey Harrison 	 * NOTE! We MUST NOT take any locks for this case. We may
812c61e211dSHarvey Harrison 	 * be in an interrupt or a critical region, and should
813c61e211dSHarvey Harrison 	 * only copy the information from the master page table,
814c61e211dSHarvey Harrison 	 * nothing more.
815c61e211dSHarvey Harrison 	 *
816c61e211dSHarvey Harrison 	 * This verifies that the fault happens in kernel space
817c61e211dSHarvey Harrison 	 * (error_code & 4) == 0, and that the fault was not a
818c61e211dSHarvey Harrison 	 * protection error (error_code & 9) == 0.
819c61e211dSHarvey Harrison 	 */
820c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
821c61e211dSHarvey Harrison 	if (unlikely(address >= TASK_SIZE)) {
822cf89ec92SHarvey Harrison #else
823cf89ec92SHarvey Harrison 	if (unlikely(address >= TASK_SIZE64)) {
824cf89ec92SHarvey Harrison #endif
825c61e211dSHarvey Harrison 		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
826c61e211dSHarvey Harrison 		    vmalloc_fault(address) >= 0)
827c61e211dSHarvey Harrison 			return;
8285b727a3bSJeremy Fitzhardinge 
8295b727a3bSJeremy Fitzhardinge 		/* Can handle a stale RO->RW TLB */
83092181f19SNick Piggin 		if (spurious_fault(error_code, address))
8315b727a3bSJeremy Fitzhardinge 			return;
8325b727a3bSJeremy Fitzhardinge 
833c61e211dSHarvey Harrison 		/*
834c61e211dSHarvey Harrison 		 * Don't take the mm semaphore here. If we fixup a prefetch
835c61e211dSHarvey Harrison 		 * fault we could otherwise deadlock.
836c61e211dSHarvey Harrison 		 */
83792181f19SNick Piggin 		bad_area_nosemaphore(regs, error_code, address);
83892181f19SNick Piggin 		return;
839c61e211dSHarvey Harrison 	}
840c61e211dSHarvey Harrison 
841c61e211dSHarvey Harrison 	/*
842891cffbdSLinus Torvalds 	 * It's safe to allow irq's after cr2 has been saved and the
843891cffbdSLinus Torvalds 	 * vmalloc fault has been handled.
844891cffbdSLinus Torvalds 	 *
845891cffbdSLinus Torvalds 	 * User-mode registers count as a user access even for any
846891cffbdSLinus Torvalds 	 * potential system fault or CPU buglet.
847c61e211dSHarvey Harrison 	 */
848891cffbdSLinus Torvalds 	if (user_mode_vm(regs)) {
849891cffbdSLinus Torvalds 		local_irq_enable();
850891cffbdSLinus Torvalds 		error_code |= PF_USER;
851891cffbdSLinus Torvalds 	} else if (regs->flags & X86_EFLAGS_IF)
852c61e211dSHarvey Harrison 		local_irq_enable();
853c61e211dSHarvey Harrison 
854891cffbdSLinus Torvalds #ifdef CONFIG_X86_64
855c61e211dSHarvey Harrison 	if (unlikely(error_code & PF_RSVD))
85692181f19SNick Piggin 		pgtable_bad(regs, error_code, address);
857891cffbdSLinus Torvalds #endif
858c61e211dSHarvey Harrison 
859c61e211dSHarvey Harrison 	/*
860c61e211dSHarvey Harrison 	 * If we're in an interrupt, have no user context or are running in an
861c61e211dSHarvey Harrison 	 * atomic region then we must not take the fault.
862c61e211dSHarvey Harrison 	 */
86392181f19SNick Piggin 	if (unlikely(in_atomic() || !mm)) {
86492181f19SNick Piggin 		bad_area_nosemaphore(regs, error_code, address);
86592181f19SNick Piggin 		return;
86692181f19SNick Piggin 	}
867c61e211dSHarvey Harrison 
8683a1dfe6eSIngo Molnar 	/*
8693a1dfe6eSIngo Molnar 	 * When running in the kernel we expect faults to occur only to
870c61e211dSHarvey Harrison 	 * addresses in user space.  All other faults represent errors in the
871c61e211dSHarvey Harrison 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
872c61e211dSHarvey Harrison 	 * erroneous fault occurring in a code path which already holds mmap_sem
873c61e211dSHarvey Harrison 	 * we will deadlock attempting to validate the fault against the
874c61e211dSHarvey Harrison 	 * address space.  Luckily the kernel only validly references user
875c61e211dSHarvey Harrison 	 * space from well defined areas of code, which are listed in the
876c61e211dSHarvey Harrison 	 * exceptions table.
877c61e211dSHarvey Harrison 	 *
878c61e211dSHarvey Harrison 	 * As the vast majority of faults will be valid we will only perform
879c61e211dSHarvey Harrison 	 * the source reference check when there is a possibility of a deadlock.
880c61e211dSHarvey Harrison 	 * Attempt to lock the address space, if we cannot we then validate the
881c61e211dSHarvey Harrison 	 * source.  If this is invalid we can skip the address space check,
882c61e211dSHarvey Harrison 	 * thus avoiding the deadlock.
883c61e211dSHarvey Harrison 	 */
88492181f19SNick Piggin 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
885c61e211dSHarvey Harrison 		if ((error_code & PF_USER) == 0 &&
88692181f19SNick Piggin 		    !search_exception_tables(regs->ip)) {
88792181f19SNick Piggin 			bad_area_nosemaphore(regs, error_code, address);
88892181f19SNick Piggin 			return;
88992181f19SNick Piggin 		}
890c61e211dSHarvey Harrison 		down_read(&mm->mmap_sem);
891*01006074SPeter Zijlstra 	} else {
892*01006074SPeter Zijlstra 		/*
893*01006074SPeter Zijlstra 		 * The above down_read_trylock() might have succeeded in which
894*01006074SPeter Zijlstra 		 * case we'll have missed the might_sleep() from down_read().
895*01006074SPeter Zijlstra 		 */
896*01006074SPeter Zijlstra 		might_sleep();
897c61e211dSHarvey Harrison 	}
898c61e211dSHarvey Harrison 
899c61e211dSHarvey Harrison 	vma = find_vma(mm, address);
90092181f19SNick Piggin 	if (unlikely(!vma)) {
90192181f19SNick Piggin 		bad_area(regs, error_code, address);
90292181f19SNick Piggin 		return;
90392181f19SNick Piggin 	}
90492181f19SNick Piggin 	if (likely(vma->vm_start <= address))
905c61e211dSHarvey Harrison 		goto good_area;
90692181f19SNick Piggin 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
90792181f19SNick Piggin 		bad_area(regs, error_code, address);
90892181f19SNick Piggin 		return;
90992181f19SNick Piggin 	}
910c61e211dSHarvey Harrison 	if (error_code & PF_USER) {
911c61e211dSHarvey Harrison 		/*
912c61e211dSHarvey Harrison 		 * Accessing the stack below %sp is always a bug.
913c61e211dSHarvey Harrison 		 * The large cushion allows instructions like enter
914c61e211dSHarvey Harrison 		 * and pusha to work.  ("enter $65535,$31" pushes
915c61e211dSHarvey Harrison 		 * 32 pointers and then decrements %sp by 65535.)
916c61e211dSHarvey Harrison 		 */
91792181f19SNick Piggin 		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
91892181f19SNick Piggin 			bad_area(regs, error_code, address);
91992181f19SNick Piggin 			return;
920c61e211dSHarvey Harrison 		}
92192181f19SNick Piggin 	}
92292181f19SNick Piggin 	if (unlikely(expand_stack(vma, address))) {
92392181f19SNick Piggin 		bad_area(regs, error_code, address);
92492181f19SNick Piggin 		return;
92592181f19SNick Piggin 	}
92692181f19SNick Piggin 
927c61e211dSHarvey Harrison 	/*
928c61e211dSHarvey Harrison 	 * Ok, we have a good vm_area for this memory access, so
929c61e211dSHarvey Harrison 	 * we can handle it..
930c61e211dSHarvey Harrison 	 */
931c61e211dSHarvey Harrison good_area:
93292181f19SNick Piggin 	write = error_code & PF_WRITE;
93392181f19SNick Piggin 	if (unlikely(access_error(error_code, write, vma))) {
93492181f19SNick Piggin 		bad_area_access_error(regs, error_code, address);
93592181f19SNick Piggin 		return;
936c61e211dSHarvey Harrison 	}
937c61e211dSHarvey Harrison 
938c61e211dSHarvey Harrison 	/*
939c61e211dSHarvey Harrison 	 * If for any reason at all we couldn't handle the fault,
940c61e211dSHarvey Harrison 	 * make sure we exit gracefully rather than endlessly redo
941c61e211dSHarvey Harrison 	 * the fault.
942c61e211dSHarvey Harrison 	 */
943c61e211dSHarvey Harrison 	fault = handle_mm_fault(mm, vma, address, write);
944c61e211dSHarvey Harrison 	if (unlikely(fault & VM_FAULT_ERROR)) {
94592181f19SNick Piggin 		mm_fault_error(regs, error_code, address, fault);
94692181f19SNick Piggin 		return;
947c61e211dSHarvey Harrison 	}
948c61e211dSHarvey Harrison 	if (fault & VM_FAULT_MAJOR)
949c61e211dSHarvey Harrison 		tsk->maj_flt++;
950c61e211dSHarvey Harrison 	else
951c61e211dSHarvey Harrison 		tsk->min_flt++;
952c61e211dSHarvey Harrison 
953c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
954c61e211dSHarvey Harrison 	/*
955c61e211dSHarvey Harrison 	 * Did it hit the DOS screen memory VA from vm86 mode?
956c61e211dSHarvey Harrison 	 */
957c61e211dSHarvey Harrison 	if (v8086_mode(regs)) {
958c61e211dSHarvey Harrison 		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
959c61e211dSHarvey Harrison 		if (bit < 32)
960c61e211dSHarvey Harrison 			tsk->thread.screen_bitmap |= 1 << bit;
961c61e211dSHarvey Harrison 	}
962c61e211dSHarvey Harrison #endif
963c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
964c61e211dSHarvey Harrison }
965c61e211dSHarvey Harrison 
/*
 * pgd_list links pages through their ->lru member; vmalloc_sync_all()
 * treats each linked page as a pgd and walks the list with pgd_lock held
 * and interrupts saved/disabled (spin_lock_irqsave).
 */
966c61e211dSHarvey Harrison DEFINE_SPINLOCK(pgd_lock);
967c61e211dSHarvey Harrison LIST_HEAD(pgd_list);
968c61e211dSHarvey Harrison 
969c61e211dSHarvey Harrison void vmalloc_sync_all(void)
970c61e211dSHarvey Harrison {
971c61e211dSHarvey Harrison 	unsigned long address;
972c61e211dSHarvey Harrison 
973cc643d46SJan Beulich #ifdef CONFIG_X86_32
974c61e211dSHarvey Harrison 	if (SHARED_KERNEL_PMD)
975c61e211dSHarvey Harrison 		return;
976c61e211dSHarvey Harrison 
977cc643d46SJan Beulich 	for (address = VMALLOC_START & PMD_MASK;
978cc643d46SJan Beulich 	     address >= TASK_SIZE && address < FIXADDR_TOP;
979cc643d46SJan Beulich 	     address += PMD_SIZE) {
980c61e211dSHarvey Harrison 		unsigned long flags;
981c61e211dSHarvey Harrison 		struct page *page;
982c61e211dSHarvey Harrison 
983c61e211dSHarvey Harrison 		spin_lock_irqsave(&pgd_lock, flags);
984e3ed910dSJeremy Fitzhardinge 		list_for_each_entry(page, &pgd_list, lru) {
985c61e211dSHarvey Harrison 			if (!vmalloc_sync_one(page_address(page),
986e3ed910dSJeremy Fitzhardinge 					      address))
987c61e211dSHarvey Harrison 				break;
988c61e211dSHarvey Harrison 		}
989c61e211dSHarvey Harrison 		spin_unlock_irqrestore(&pgd_lock, flags);
990c61e211dSHarvey Harrison 	}
991c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
992cc643d46SJan Beulich 	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
993cc643d46SJan Beulich 	     address += PGDIR_SIZE) {
994c61e211dSHarvey Harrison 		const pgd_t *pgd_ref = pgd_offset_k(address);
99558d5d0d8SIngo Molnar 		unsigned long flags;
996c61e211dSHarvey Harrison 		struct page *page;
997c61e211dSHarvey Harrison 
998c61e211dSHarvey Harrison 		if (pgd_none(*pgd_ref))
999c61e211dSHarvey Harrison 			continue;
100058d5d0d8SIngo Molnar 		spin_lock_irqsave(&pgd_lock, flags);
1001c61e211dSHarvey Harrison 		list_for_each_entry(page, &pgd_list, lru) {
1002c61e211dSHarvey Harrison 			pgd_t *pgd;
1003c61e211dSHarvey Harrison 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
1004c61e211dSHarvey Harrison 			if (pgd_none(*pgd))
1005c61e211dSHarvey Harrison 				set_pgd(pgd, *pgd_ref);
1006c61e211dSHarvey Harrison 			else
1007c61e211dSHarvey Harrison 				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
1008c61e211dSHarvey Harrison 		}
100958d5d0d8SIngo Molnar 		spin_unlock_irqrestore(&pgd_lock, flags);
1010c61e211dSHarvey Harrison 	}
1011c61e211dSHarvey Harrison #endif
1012c61e211dSHarvey Harrison }
1013