xref: /openbmc/linux/arch/x86/mm/fault.c (revision 86069782d62e731b4835a0cf8eb7d1d0e17cf306)
1c61e211dSHarvey Harrison /*
2c61e211dSHarvey Harrison  *  Copyright (C) 1995  Linus Torvalds
3c61e211dSHarvey Harrison  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4c61e211dSHarvey Harrison  */
5c61e211dSHarvey Harrison 
6c61e211dSHarvey Harrison #include <linux/signal.h>
7c61e211dSHarvey Harrison #include <linux/sched.h>
8c61e211dSHarvey Harrison #include <linux/kernel.h>
9c61e211dSHarvey Harrison #include <linux/errno.h>
10c61e211dSHarvey Harrison #include <linux/string.h>
11c61e211dSHarvey Harrison #include <linux/types.h>
12c61e211dSHarvey Harrison #include <linux/ptrace.h>
13c61e211dSHarvey Harrison #include <linux/mman.h>
14c61e211dSHarvey Harrison #include <linux/mm.h>
15c61e211dSHarvey Harrison #include <linux/smp.h>
16c61e211dSHarvey Harrison #include <linux/interrupt.h>
17c61e211dSHarvey Harrison #include <linux/init.h>
18c61e211dSHarvey Harrison #include <linux/tty.h>
19c61e211dSHarvey Harrison #include <linux/vt_kern.h>		/* For unblank_screen() */
20c61e211dSHarvey Harrison #include <linux/compiler.h>
21c61e211dSHarvey Harrison #include <linux/highmem.h>
22c61e211dSHarvey Harrison #include <linux/bootmem.h>		/* for max_low_pfn */
23c61e211dSHarvey Harrison #include <linux/vmalloc.h>
24c61e211dSHarvey Harrison #include <linux/module.h>
25c61e211dSHarvey Harrison #include <linux/kprobes.h>
26c61e211dSHarvey Harrison #include <linux/uaccess.h>
27c61e211dSHarvey Harrison #include <linux/kdebug.h>
28c61e211dSHarvey Harrison 
29c61e211dSHarvey Harrison #include <asm/system.h>
30c61e211dSHarvey Harrison #include <asm/desc.h>
31c61e211dSHarvey Harrison #include <asm/segment.h>
32c61e211dSHarvey Harrison #include <asm/pgalloc.h>
33c61e211dSHarvey Harrison #include <asm/smp.h>
34c61e211dSHarvey Harrison #include <asm/tlbflush.h>
35c61e211dSHarvey Harrison #include <asm/proto.h>
36c61e211dSHarvey Harrison #include <asm-generic/sections.h>
37c61e211dSHarvey Harrison 
/*
 * Page fault error code bits, as pushed by the CPU for the #PF exception:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)
51c61e211dSHarvey Harrison 
#ifdef CONFIG_PAGE_FAULT_HANDLERS
/* List of custom page fault handlers; traversed locklessly under RCU. */
static HLIST_HEAD(pf_handlers); /* protected by RCU */
/* Serializes list writers only; readers never take this lock. */
static DEFINE_SPINLOCK(pf_handlers_writer);

/*
 * register_page_fault_handler:
 * Add @new_pfh to the head of the handler list so it is consulted by
 * handle_custom_pf() on subsequent page faults.  Interrupts are disabled
 * while the writer lock is held, so this is safe from any context.
 */
void register_page_fault_handler(struct pf_handler *new_pfh)
{
	unsigned long flags;
	spin_lock_irqsave(&pf_handlers_writer, flags);
	hlist_add_head_rcu(&new_pfh->hlist, &pf_handlers);
	spin_unlock_irqrestore(&pf_handlers_writer, flags);
}
EXPORT_SYMBOL_GPL(register_page_fault_handler);

/**
 * unregister_page_fault_handler:
 * The caller must ensure @old_pfh is not in use anymore before freeing it.
 * This function does not guarantee it. The list of handlers is protected by
 * RCU, so you can do this by e.g. calling synchronize_rcu().
 */
void unregister_page_fault_handler(struct pf_handler *old_pfh)
{
	unsigned long flags;
	spin_lock_irqsave(&pf_handlers_writer, flags);
	hlist_del_rcu(&old_pfh->hlist);
	spin_unlock_irqrestore(&pf_handlers_writer, flags);
}
EXPORT_SYMBOL_GPL(unregister_page_fault_handler);
#endif
80*86069782SPekka Paalanen 
/* returns non-zero if do_page_fault() should return */
static int handle_custom_pf(struct pt_regs *regs, unsigned long error_code,
							unsigned long address)
{
#ifdef CONFIG_PAGE_FAULT_HANDLERS
	int ret = 0;
	struct pf_handler *cur;
	struct hlist_node *ncur;

	/* Fast path: nothing registered, fall through to normal handling. */
	if (hlist_empty(&pf_handlers))
		return 0;

	/*
	 * Walk the handler list under RCU.  The first handler returning
	 * non-zero claims the fault and terminates the walk; its return
	 * value is propagated to the caller.
	 */
	rcu_read_lock();
	hlist_for_each_entry_rcu(cur, ncur, &pf_handlers, hlist) {
		ret = cur->handler(regs, error_code, address);
		if (ret)
			break;
	}
	rcu_read_unlock();
	return ret;
#else
	return 0;
#endif
}
105*86069782SPekka Paalanen 
/*
 * Give kprobes a chance to claim a kernel-mode fault.  Returns non-zero
 * when a kprobe fault handler consumed it (trap 14 == page fault) and the
 * normal fault path must be skipped.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}
128c61e211dSHarvey Harrison 
/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 *
 * Returns non-zero when the instruction at the faulting IP decodes to a
 * prefetch (0x0F 0x0D or 0x0F 0x18 family), possibly behind legal
 * prefixes, so the spurious fault can be ignored by the caller.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was a exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;	/* 15 bytes = maximum x86 instruction length */

	/* Don't decode kernel addresses on behalf of a user-mode fault. */
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	/* Skip over legal prefixes until an opcode byte decides the case. */
	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			/* Read the second opcode byte; give up if unmapped. */
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
220c61e211dSHarvey Harrison 
221c61e211dSHarvey Harrison static void force_sig_info_fault(int si_signo, int si_code,
222c61e211dSHarvey Harrison 	unsigned long address, struct task_struct *tsk)
223c61e211dSHarvey Harrison {
224c61e211dSHarvey Harrison 	siginfo_t info;
225c61e211dSHarvey Harrison 
226c61e211dSHarvey Harrison 	info.si_signo = si_signo;
227c61e211dSHarvey Harrison 	info.si_errno = 0;
228c61e211dSHarvey Harrison 	info.si_code = si_code;
229c61e211dSHarvey Harrison 	info.si_addr = (void __user *)address;
230c61e211dSHarvey Harrison 	force_sig_info(si_signo, &info, tsk);
231c61e211dSHarvey Harrison }
232c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/* Return non-zero if @p cannot be safely read as a kernel address. */
static int bad_address(void *p)
{
	unsigned long scratch;

	return probe_kernel_address((unsigned long *)p, scratch);
}
#endif
240c61e211dSHarvey Harrison 
/*
 * Print the page-table entries covering @address to the console, for use
 * in oops reports.  The 32-bit variant walks CR3 by hand (PAE and
 * non-PAE); the 64-bit variant walks pgd/pud/pmd/pte and stops at the
 * first non-present level or prints BAD if a table itself is unreadable.
 */
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
		                                         & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		/* Strip NX so the pfn check below sees a plain frame number. */
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
		                                         & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	/* A large PUD maps @address directly; no pmd/pte levels exist. */
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}
313c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_32
/*
 * Copy the kernel (init_mm) mapping for @address into the page table
 * rooted at @pgd.  Returns the kernel pmd covering @address, or NULL if
 * init_mm itself has no mapping at some level (the caller then treats
 * the fault as genuine).
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		/* Pull the kernel mapping into this task's page table. */
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		/* Already present: must agree with the kernel mapping. */
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif
351c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/* Printed once by is_errata93() when the workaround first triggers. */
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif
359c61e211dSHarvey Harrison 
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32

   Returns non-zero (and fixes up regs->ip) when the erratum was detected
   and worked around. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;	/* print the BIOS warning only once */
	/* Only an instruction-fetch fault at the faulting IP qualifies. */
	if (address != regs->ip)
		return 0;
	/* The erratum clears the upper 32 bits; anything else is genuine. */
	if ((address >> 32) != 0)
		return 0;
	/* Re-extend the address and see if it lands in kernel/module text. */
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}
390c61e211dSHarvey Harrison 
/*
 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
 * addresses >4GB.  We catch this in the page fault handler because these
 * addresses are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	/* Compat mode: the 32-bit user code segment, or any LDT selector. */
	int compat_cs = (regs->cs == __USER32_CS) || (regs->cs & (1 << 2));

	if (compat_cs && (address >> 32))
		return 1;
#endif
	return 0;
}
406c61e211dSHarvey Harrison 
407c61e211dSHarvey Harrison void do_invalid_op(struct pt_regs *, unsigned long);
408c61e211dSHarvey Harrison 
/*
 * Detect the Pentium F00F erratum: a fault whose address matches IDT
 * entry 6 (#UD) means the erratum fired; deliver an invalid-opcode trap
 * instead and return non-zero to say the fault was handled here.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		/* Which IDT entry does the faulting address fall on? */
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}
427c61e211dSHarvey Harrison 
/*
 * Print the "BUG: unable to handle kernel ..." oops banner for a fault at
 * @address, including an NX-violation hint (PAE), the faulting IP and a
 * page-table dump.
 */
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	/* Instruction fetch from a present-but-NX page: flag it loudly. */
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	/* Faults in the first page are almost always NULL-pointer derefs. */
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}
462c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * Oops path for a corrupted page table at @address: report it, record the
 * fault details in the task, and die with SIGKILL.  Does not return to
 * the faulting context.
 */
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	/* Record fault details for the debugger/core dump (trap 14 = #PF). */
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif
482c61e211dSHarvey Harrison 
483d8b57bb7SThomas Gleixner static int spurious_fault_check(unsigned long error_code, pte_t *pte)
484d8b57bb7SThomas Gleixner {
485d8b57bb7SThomas Gleixner 	if ((error_code & PF_WRITE) && !pte_write(*pte))
486d8b57bb7SThomas Gleixner 		return 0;
487d8b57bb7SThomas Gleixner 	if ((error_code & PF_INSTR) && !pte_exec(*pte))
488d8b57bb7SThomas Gleixner 		return 0;
489d8b57bb7SThomas Gleixner 
490d8b57bb7SThomas Gleixner 	return 1;
491d8b57bb7SThomas Gleixner }
492d8b57bb7SThomas Gleixner 
/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero when the current page tables already permit the
 * faulting access, i.e. the fault can be ignored.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	/* Large PUD maps the address directly; check its permission bits. */
	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	/* Likewise for a large (2M/4M) PMD mapping. */
	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}
5385b727a3bSJeremy Fitzhardinge 
/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 *
 * Returns 0 when the fault was resolved by syncing this task's page
 * tables with the kernel reference tables, -1 when it is a real fault.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the later
	   case just flush. */

	/* Kernel threads may have no mm; fall back to init_mm then. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}
619c61e211dSHarvey Harrison 
/*
 * When non-zero, a rate-limited diagnostic line is printed for user
 * processes that die with an unhandled SIGSEGV (see the printk in
 * do_page_fault() below).
 */
int show_unhandled_signals = 1;
621c61e211dSHarvey Harrison 
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * @regs:       saved register state at the time of the fault
 * @error_code: hardware page-fault error code (PF_* bits: protection,
 *              write, user-mode, reserved-bit -- tested individually below)
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;	/* saved by oops_begin(), restored by oops_end() */
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	/* Warm the cacheline of the semaphore we are likely to take below. */
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	/* Default: "no mapping"; upgraded to SEGV_ACCERR once a vma is found. */
	si_code = SEGV_MAPERR;

	/* Give fault-notifier consumers first claim on the fault. */
	if (notify_page_fault(regs))
		return;
	/*
	 * NOTE(review): handle_custom_pf() lets an externally registered
	 * handler consume the fault entirely -- presumably the mmio-trace
	 * hook; confirm against its registration site.
	 */
	if (handle_custom_pf(regs, error_code, address))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}


#ifdef CONFIG_X86_32
	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	/* A reserved-bit violation means the page tables are corrupt. */
	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:	/* 64-bit retry point after yielding in the init-task OOM case below */
#endif
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work.  ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

#ifdef CONFIG_X86_32
survive:	/* 32-bit retry point for the init-task OOM case below */
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	/* Account the fault against the task for getrusage()/procfs. */
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	/* Never OOM-kill init: yield and retry the fault instead. */
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}
962c61e211dSHarvey Harrison 
/*
 * pgd_lock protects pgd_list, the list of pages holding process pgds
 * that vmalloc_sync_all() walks below to propagate kernel-space
 * mappings.  Taken with irqs disabled (spin_lock_irqsave) there.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
965c61e211dSHarvey Harrison 
966c61e211dSHarvey Harrison void vmalloc_sync_all(void)
967c61e211dSHarvey Harrison {
968c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
969c61e211dSHarvey Harrison 	/*
970c61e211dSHarvey Harrison 	 * Note that races in the updates of insync and start aren't
971c61e211dSHarvey Harrison 	 * problematic: insync can only get set bits added, and updates to
972c61e211dSHarvey Harrison 	 * start are only improving performance (without affecting correctness
973c61e211dSHarvey Harrison 	 * if undone).
974c61e211dSHarvey Harrison 	 */
975c61e211dSHarvey Harrison 	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
976c61e211dSHarvey Harrison 	static unsigned long start = TASK_SIZE;
977c61e211dSHarvey Harrison 	unsigned long address;
978c61e211dSHarvey Harrison 
979c61e211dSHarvey Harrison 	if (SHARED_KERNEL_PMD)
980c61e211dSHarvey Harrison 		return;
981c61e211dSHarvey Harrison 
982c61e211dSHarvey Harrison 	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
983c61e211dSHarvey Harrison 	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
984c61e211dSHarvey Harrison 		if (!test_bit(pgd_index(address), insync)) {
985c61e211dSHarvey Harrison 			unsigned long flags;
986c61e211dSHarvey Harrison 			struct page *page;
987c61e211dSHarvey Harrison 
988c61e211dSHarvey Harrison 			spin_lock_irqsave(&pgd_lock, flags);
989e3ed910dSJeremy Fitzhardinge 			list_for_each_entry(page, &pgd_list, lru) {
990c61e211dSHarvey Harrison 				if (!vmalloc_sync_one(page_address(page),
991e3ed910dSJeremy Fitzhardinge 						      address))
992c61e211dSHarvey Harrison 					break;
993c61e211dSHarvey Harrison 			}
994c61e211dSHarvey Harrison 			spin_unlock_irqrestore(&pgd_lock, flags);
995c61e211dSHarvey Harrison 			if (!page)
996c61e211dSHarvey Harrison 				set_bit(pgd_index(address), insync);
997c61e211dSHarvey Harrison 		}
998c61e211dSHarvey Harrison 		if (address == start && test_bit(pgd_index(address), insync))
999c61e211dSHarvey Harrison 			start = address + PGDIR_SIZE;
1000c61e211dSHarvey Harrison 	}
1001c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
1002c61e211dSHarvey Harrison 	/*
1003c61e211dSHarvey Harrison 	 * Note that races in the updates of insync and start aren't
1004c61e211dSHarvey Harrison 	 * problematic: insync can only get set bits added, and updates to
1005c61e211dSHarvey Harrison 	 * start are only improving performance (without affecting correctness
1006c61e211dSHarvey Harrison 	 * if undone).
1007c61e211dSHarvey Harrison 	 */
1008c61e211dSHarvey Harrison 	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
1009c61e211dSHarvey Harrison 	static unsigned long start = VMALLOC_START & PGDIR_MASK;
1010c61e211dSHarvey Harrison 	unsigned long address;
1011c61e211dSHarvey Harrison 
1012c61e211dSHarvey Harrison 	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
1013c61e211dSHarvey Harrison 		if (!test_bit(pgd_index(address), insync)) {
1014c61e211dSHarvey Harrison 			const pgd_t *pgd_ref = pgd_offset_k(address);
101558d5d0d8SIngo Molnar 			unsigned long flags;
1016c61e211dSHarvey Harrison 			struct page *page;
1017c61e211dSHarvey Harrison 
1018c61e211dSHarvey Harrison 			if (pgd_none(*pgd_ref))
1019c61e211dSHarvey Harrison 				continue;
102058d5d0d8SIngo Molnar 			spin_lock_irqsave(&pgd_lock, flags);
1021c61e211dSHarvey Harrison 			list_for_each_entry(page, &pgd_list, lru) {
1022c61e211dSHarvey Harrison 				pgd_t *pgd;
1023c61e211dSHarvey Harrison 				pgd = (pgd_t *)page_address(page) + pgd_index(address);
1024c61e211dSHarvey Harrison 				if (pgd_none(*pgd))
1025c61e211dSHarvey Harrison 					set_pgd(pgd, *pgd_ref);
1026c61e211dSHarvey Harrison 				else
1027c61e211dSHarvey Harrison 					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
1028c61e211dSHarvey Harrison 			}
102958d5d0d8SIngo Molnar 			spin_unlock_irqrestore(&pgd_lock, flags);
1030c61e211dSHarvey Harrison 			set_bit(pgd_index(address), insync);
1031c61e211dSHarvey Harrison 		}
1032c61e211dSHarvey Harrison 		if (address == start)
1033c61e211dSHarvey Harrison 			start = address + PGDIR_SIZE;
1034c61e211dSHarvey Harrison 	}
1035c61e211dSHarvey Harrison #endif
1036c61e211dSHarvey Harrison }
1037