xref: /openbmc/linux/arch/x86/mm/fault.c (revision 10c43d2eb50c9a5ad60388b9d3c41c31150049e6)
1c61e211dSHarvey Harrison /*
2c61e211dSHarvey Harrison  *  Copyright (C) 1995  Linus Torvalds
3c61e211dSHarvey Harrison  *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4c61e211dSHarvey Harrison  */
5c61e211dSHarvey Harrison 
6c61e211dSHarvey Harrison #include <linux/signal.h>
7c61e211dSHarvey Harrison #include <linux/sched.h>
8c61e211dSHarvey Harrison #include <linux/kernel.h>
9c61e211dSHarvey Harrison #include <linux/errno.h>
10c61e211dSHarvey Harrison #include <linux/string.h>
11c61e211dSHarvey Harrison #include <linux/types.h>
12c61e211dSHarvey Harrison #include <linux/ptrace.h>
13c61e211dSHarvey Harrison #include <linux/mman.h>
14c61e211dSHarvey Harrison #include <linux/mm.h>
15c61e211dSHarvey Harrison #include <linux/smp.h>
16c61e211dSHarvey Harrison #include <linux/interrupt.h>
17c61e211dSHarvey Harrison #include <linux/init.h>
18c61e211dSHarvey Harrison #include <linux/tty.h>
19c61e211dSHarvey Harrison #include <linux/vt_kern.h>		/* For unblank_screen() */
20c61e211dSHarvey Harrison #include <linux/compiler.h>
21c61e211dSHarvey Harrison #include <linux/highmem.h>
22c61e211dSHarvey Harrison #include <linux/bootmem.h>		/* for max_low_pfn */
23c61e211dSHarvey Harrison #include <linux/vmalloc.h>
24c61e211dSHarvey Harrison #include <linux/module.h>
25c61e211dSHarvey Harrison #include <linux/kprobes.h>
26c61e211dSHarvey Harrison #include <linux/uaccess.h>
27c61e211dSHarvey Harrison #include <linux/kdebug.h>
28c61e211dSHarvey Harrison 
29c61e211dSHarvey Harrison #include <asm/system.h>
30c61e211dSHarvey Harrison #include <asm/desc.h>
31c61e211dSHarvey Harrison #include <asm/segment.h>
32c61e211dSHarvey Harrison #include <asm/pgalloc.h>
33c61e211dSHarvey Harrison #include <asm/smp.h>
34c61e211dSHarvey Harrison #include <asm/tlbflush.h>
35c61e211dSHarvey Harrison #include <asm/proto.h>
36c61e211dSHarvey Harrison #include <asm-generic/sections.h>
37c61e211dSHarvey Harrison 
/*
 * Page fault error code bits, as pushed by the CPU on a #PF exception:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)	/* protection fault: page was present */
#define PF_WRITE	(1<<1)	/* write access (else read) */
#define PF_USER		(1<<2)	/* fault taken in user mode */
#define PF_RSVD		(1<<3)	/* reserved bit set in a page table entry */
#define PF_INSTR	(1<<4)	/* fault was an instruction fetch */
51c61e211dSHarvey Harrison 
#ifdef CONFIG_MMIOTRACE_HOOKS
/*
 * Single mmiotrace page fault hook.  Readers access it under
 * rcu_read_lock() (see call_mmiotrace()); updates are serialized by
 * mmiotrace_handler_lock and published with rcu_assign_pointer() so a
 * concurrent reader never observes the pointer before the handler's
 * initialization stores are visible.
 */
static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */
static DEFINE_SPINLOCK(mmiotrace_handler_lock);

/*
 * Install @new_pfh as the mmiotrace page fault handler.
 *
 * Returns 0 on success, -EBUSY if a handler is already registered.
 */
int mmiotrace_register_pf(pf_handler_func new_pfh)
{
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&mmiotrace_handler_lock, flags);
	if (mmiotrace_pf_handler)
		ret = -EBUSY;
	else
		/*
		 * rcu_assign_pointer() orders the handler's setup
		 * before publication; a plain store would not.
		 */
		rcu_assign_pointer(mmiotrace_pf_handler, new_pfh);
	spin_unlock_irqrestore(&mmiotrace_handler_lock, flags);
	return ret;
}
EXPORT_SYMBOL_GPL(mmiotrace_register_pf);

/**
 * mmiotrace_unregister_pf:
 * The caller must ensure @old_pfh is not in use anymore before freeing it.
 * This function does not guarantee it. The handler function pointer is
 * protected by RCU, so you can do this by e.g. calling synchronize_rcu().
 *
 * Returns 0 on success, -EPERM if @old_pfh is not the registered handler.
 */
int mmiotrace_unregister_pf(pf_handler_func old_pfh)
{
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&mmiotrace_handler_lock, flags);
	if (mmiotrace_pf_handler != old_pfh)
		ret = -EPERM;
	else
		rcu_assign_pointer(mmiotrace_pf_handler, NULL);
	spin_unlock_irqrestore(&mmiotrace_handler_lock, flags);
	return ret;
}
EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf);
#endif /* CONFIG_MMIOTRACE_HOOKS */
9086069782SPekka Paalanen 
/*
 * Invoke the registered mmiotrace page fault handler, if any.
 * Returns non-zero if do_page_fault() should return (fault consumed).
 *
 * Fix: the original read mmiotrace_pf_handler twice (once for the NULL
 * check, once for the call).  An unregister between the two reads could
 * make the call go through a NULL pointer.  Read it exactly once, via
 * rcu_dereference() to pair with the rcu_assign_pointer() publication.
 */
static inline int call_mmiotrace(struct pt_regs *regs,
					unsigned long error_code,
					unsigned long address)
{
#ifdef CONFIG_MMIOTRACE_HOOKS
	int ret = 0;
	pf_handler_func pfh;

	rcu_read_lock();
	pfh = rcu_dereference(mmiotrace_pf_handler);
	if (pfh)
		ret = pfh(regs, error_code, address);
	rcu_read_unlock();
	return ret;
#else
	return 0;
#endif
}
10786069782SPekka Paalanen 
/*
 * Give kprobes a chance to handle a kernel-mode fault.
 * Returns non-zero if a kprobe fault handler consumed the fault
 * (trap number 14 == page fault).  User-mode faults are never
 * handed to kprobes.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		/* pin the CPU so kprobe_running()'s per-CPU state is stable */
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}
130c61e211dSHarvey Harrison 
/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 *
 * Returns non-zero if the instruction at regs->ip decodes as a prefetch
 * (0F 0D or 0F 18), possibly preceded by valid prefix bytes; such faults
 * may be silently ignored by the caller.
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was a exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;	/* x86 instructions are at most 15 bytes */

	/* don't decode kernel addresses on behalf of a user-mode fault */
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		/* the instruction stream itself may be unmapped */
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			/* read the second opcode byte; may fault as well */
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
222c61e211dSHarvey Harrison 
223c61e211dSHarvey Harrison static void force_sig_info_fault(int si_signo, int si_code,
224c61e211dSHarvey Harrison 	unsigned long address, struct task_struct *tsk)
225c61e211dSHarvey Harrison {
226c61e211dSHarvey Harrison 	siginfo_t info;
227c61e211dSHarvey Harrison 
228c61e211dSHarvey Harrison 	info.si_signo = si_signo;
229c61e211dSHarvey Harrison 	info.si_errno = 0;
230c61e211dSHarvey Harrison 	info.si_code = si_code;
231c61e211dSHarvey Harrison 	info.si_addr = (void __user *)address;
232c61e211dSHarvey Harrison 	force_sig_info(si_signo, &info, tsk);
233c61e211dSHarvey Harrison }
234c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * Returns non-zero if a page-table-entry-sized read at @p would fault,
 * i.e. @p does not point at readable kernel memory.
 */
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif
242c61e211dSHarvey Harrison 
/*
 * Dump the page table entries covering @address to the console.
 * Called from oops paths, so it must tolerate partially invalid
 * tables: 64-bit probes each level with bad_address() before
 * dereferencing, 32-bit checks PFN range and the present bit.
 */
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	/* wide enough for a raw entry: 64-bit under PAE, 32-bit otherwise */
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
		                                         & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		/* clear NX so the PFN range check below stays meaningful */
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
		                                         & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	/* stop at a huge mapping: there is no lower-level table to walk */
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}
315c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_32
/*
 * Copy the kernel (init_mm) mapping covering @address into the page
 * directory @pgd of another mm.  Returns the kernel pmd covering
 * @address, or NULL if init_mm itself has no mapping there.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		/* already synced: both must point at the same pmd page */
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif
353c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/* Printed once when the K8 erratum #93 workaround fires; see is_errata93(). */
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif
361c61e211dSHarvey Harrison 
/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32

   Returns non-zero (after restoring regs->ip) when the fault matched
   the erratum signature and was worked around. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	/* erratum signature: instruction-fetch fault with truncated RIP */
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	/* re-extend the address with the lost upper 32 bits */
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}
392c61e211dSHarvey Harrison 
/*
 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
 * addresses >4GB.  We catch this in the page fault handler because these
 * addresses are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	/* only addresses above 4GB can trigger the erratum */
	if (!(address >> 32))
		return 0;
	/* 32-bit code segment, or any LDT-based code segment (TI bit set) */
	if (regs->cs == __USER32_CS || (regs->cs & (1<<2)))
		return 1;
#endif
	return 0;
}
408c61e211dSHarvey Harrison 
void do_invalid_op(struct pt_regs *, unsigned long);

/*
 * Pentium "F0 0F C7 C8" erratum workaround: the IDT is mapped so that
 * a lockup-inducing access faults instead.  If the faulting address
 * lands on IDT vector 6 (invalid opcode), deliver that trap ourselves.
 * Returns non-zero when the fault was consumed this way.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long vector;

	if (boot_cpu_data.f00f_bug) {
		/* each IDT entry is 8 bytes on 32-bit */
		vector = (address - idt_descr.address) >> 3;
		if (vector == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}
429c61e211dSHarvey Harrison 
/*
 * Print the opening "BUG: unable to handle kernel ..." banner of a
 * kernel-fault oops: the faulting address, the IP and the page table
 * entries covering the address.
 */
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	/* an instruction fetch on a present-but-NX page looks like an exploit */
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}
464c61e211dSHarvey Harrison 
#ifdef CONFIG_X86_64
/*
 * Oops on a corrupted kernel page table: dump the offending tables,
 * record the fault details in the task, and end the oops with SIGKILL.
 */
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;	/* trap 14 == page fault */
	tsk->thread.error_code = error_code;
	/* NOTE(review): non-zero __die() appears to mean "already handled",
	   so regs is withheld from oops_end() — confirm against __die() */
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif
484c61e211dSHarvey Harrison 
485d8b57bb7SThomas Gleixner static int spurious_fault_check(unsigned long error_code, pte_t *pte)
486d8b57bb7SThomas Gleixner {
487d8b57bb7SThomas Gleixner 	if ((error_code & PF_WRITE) && !pte_write(*pte))
488d8b57bb7SThomas Gleixner 		return 0;
489d8b57bb7SThomas Gleixner 	if ((error_code & PF_INSTR) && !pte_exec(*pte))
490d8b57bb7SThomas Gleixner 		return 0;
491d8b57bb7SThomas Gleixner 
492d8b57bb7SThomas Gleixner 	return 1;
493d8b57bb7SThomas Gleixner }
494d8b57bb7SThomas Gleixner 
/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * Returns non-zero when the current page tables already permit the
 * access, i.e. the fault came from a stale TLB entry and can be
 * ignored.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	/* huge mapping: the pud itself carries the permissions */
	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	/* large page: the pmd itself carries the permissions */
	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}
5405b727a3bSJeremy Fitzhardinge 
5415b727a3bSJeremy Fitzhardinge /*
542c61e211dSHarvey Harrison  * X86_32
543c61e211dSHarvey Harrison  * Handle a fault on the vmalloc or module mapping area
544c61e211dSHarvey Harrison  *
545c61e211dSHarvey Harrison  * X86_64
546c61e211dSHarvey Harrison  * Handle a fault on the vmalloc area
547c61e211dSHarvey Harrison  *
548c61e211dSHarvey Harrison  * This assumes no large pages in there.
549c61e211dSHarvey Harrison  */
550c61e211dSHarvey Harrison static int vmalloc_fault(unsigned long address)
551c61e211dSHarvey Harrison {
552c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
553c61e211dSHarvey Harrison 	unsigned long pgd_paddr;
554c61e211dSHarvey Harrison 	pmd_t *pmd_k;
555c61e211dSHarvey Harrison 	pte_t *pte_k;
556c61e211dSHarvey Harrison 	/*
557c61e211dSHarvey Harrison 	 * Synchronize this task's top level page-table
558c61e211dSHarvey Harrison 	 * with the 'reference' page table.
559c61e211dSHarvey Harrison 	 *
560c61e211dSHarvey Harrison 	 * Do _not_ use "current" here. We might be inside
561c61e211dSHarvey Harrison 	 * an interrupt in the middle of a task switch..
562c61e211dSHarvey Harrison 	 */
563c61e211dSHarvey Harrison 	pgd_paddr = read_cr3();
564c61e211dSHarvey Harrison 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
565c61e211dSHarvey Harrison 	if (!pmd_k)
566c61e211dSHarvey Harrison 		return -1;
567c61e211dSHarvey Harrison 	pte_k = pte_offset_kernel(pmd_k, address);
568c61e211dSHarvey Harrison 	if (!pte_present(*pte_k))
569c61e211dSHarvey Harrison 		return -1;
570c61e211dSHarvey Harrison 	return 0;
571c61e211dSHarvey Harrison #else
572c61e211dSHarvey Harrison 	pgd_t *pgd, *pgd_ref;
573c61e211dSHarvey Harrison 	pud_t *pud, *pud_ref;
574c61e211dSHarvey Harrison 	pmd_t *pmd, *pmd_ref;
575c61e211dSHarvey Harrison 	pte_t *pte, *pte_ref;
576c61e211dSHarvey Harrison 
577cf89ec92SHarvey Harrison 	/* Make sure we are in vmalloc area */
578cf89ec92SHarvey Harrison 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
579cf89ec92SHarvey Harrison 		return -1;
580cf89ec92SHarvey Harrison 
581c61e211dSHarvey Harrison 	/* Copy kernel mappings over when needed. This can also
582c61e211dSHarvey Harrison 	   happen within a race in page table update. In the later
583c61e211dSHarvey Harrison 	   case just flush. */
584c61e211dSHarvey Harrison 
585c61e211dSHarvey Harrison 	pgd = pgd_offset(current->mm ?: &init_mm, address);
586c61e211dSHarvey Harrison 	pgd_ref = pgd_offset_k(address);
587c61e211dSHarvey Harrison 	if (pgd_none(*pgd_ref))
588c61e211dSHarvey Harrison 		return -1;
589c61e211dSHarvey Harrison 	if (pgd_none(*pgd))
590c61e211dSHarvey Harrison 		set_pgd(pgd, *pgd_ref);
591c61e211dSHarvey Harrison 	else
592c61e211dSHarvey Harrison 		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
593c61e211dSHarvey Harrison 
594c61e211dSHarvey Harrison 	/* Below here mismatches are bugs because these lower tables
595c61e211dSHarvey Harrison 	   are shared */
596c61e211dSHarvey Harrison 
597c61e211dSHarvey Harrison 	pud = pud_offset(pgd, address);
598c61e211dSHarvey Harrison 	pud_ref = pud_offset(pgd_ref, address);
599c61e211dSHarvey Harrison 	if (pud_none(*pud_ref))
600c61e211dSHarvey Harrison 		return -1;
601c61e211dSHarvey Harrison 	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
602c61e211dSHarvey Harrison 		BUG();
603c61e211dSHarvey Harrison 	pmd = pmd_offset(pud, address);
604c61e211dSHarvey Harrison 	pmd_ref = pmd_offset(pud_ref, address);
605c61e211dSHarvey Harrison 	if (pmd_none(*pmd_ref))
606c61e211dSHarvey Harrison 		return -1;
607c61e211dSHarvey Harrison 	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
608c61e211dSHarvey Harrison 		BUG();
609c61e211dSHarvey Harrison 	pte_ref = pte_offset_kernel(pmd_ref, address);
610c61e211dSHarvey Harrison 	if (!pte_present(*pte_ref))
611c61e211dSHarvey Harrison 		return -1;
612c61e211dSHarvey Harrison 	pte = pte_offset_kernel(pmd, address);
613c61e211dSHarvey Harrison 	/* Don't use pte_page here, because the mappings can point
614c61e211dSHarvey Harrison 	   outside mem_map, and the NUMA hash lookup cannot handle
615c61e211dSHarvey Harrison 	   that. */
616c61e211dSHarvey Harrison 	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
617c61e211dSHarvey Harrison 		BUG();
618c61e211dSHarvey Harrison 	return 0;
619c61e211dSHarvey Harrison #endif
620c61e211dSHarvey Harrison }
621c61e211dSHarvey Harrison 
622c61e211dSHarvey Harrison int show_unhandled_signals = 1;
623c61e211dSHarvey Harrison 
624c61e211dSHarvey Harrison /*
625c61e211dSHarvey Harrison  * This routine handles page faults.  It determines the address,
626c61e211dSHarvey Harrison  * and the problem, and then passes it off to one of the appropriate
627c61e211dSHarvey Harrison  * routines.
628c61e211dSHarvey Harrison  */
629c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
630c61e211dSHarvey Harrison asmlinkage
631c61e211dSHarvey Harrison #endif
632c61e211dSHarvey Harrison void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
633c61e211dSHarvey Harrison {
634c61e211dSHarvey Harrison 	struct task_struct *tsk;
635c61e211dSHarvey Harrison 	struct mm_struct *mm;
636c61e211dSHarvey Harrison 	struct vm_area_struct *vma;
637c61e211dSHarvey Harrison 	unsigned long address;
638c61e211dSHarvey Harrison 	int write, si_code;
639c61e211dSHarvey Harrison 	int fault;
640c61e211dSHarvey Harrison #ifdef CONFIG_X86_64
641c61e211dSHarvey Harrison 	unsigned long flags;
642c61e211dSHarvey Harrison #endif
643c61e211dSHarvey Harrison 
644c61e211dSHarvey Harrison 	/*
645c61e211dSHarvey Harrison 	 * We can fault from pretty much anywhere, with unknown IRQ state.
646c61e211dSHarvey Harrison 	 */
647c61e211dSHarvey Harrison 	trace_hardirqs_fixup();
648c61e211dSHarvey Harrison 
649c61e211dSHarvey Harrison 	tsk = current;
650c61e211dSHarvey Harrison 	mm = tsk->mm;
651c61e211dSHarvey Harrison 	prefetchw(&mm->mmap_sem);
652c61e211dSHarvey Harrison 
653c61e211dSHarvey Harrison 	/* get the address */
654c61e211dSHarvey Harrison 	address = read_cr2();
655c61e211dSHarvey Harrison 
656c61e211dSHarvey Harrison 	si_code = SEGV_MAPERR;
657c61e211dSHarvey Harrison 
658c61e211dSHarvey Harrison 	if (notify_page_fault(regs))
659c61e211dSHarvey Harrison 		return;
660*10c43d2eSPekka Paalanen 	if (call_mmiotrace(regs, error_code, address))
66186069782SPekka Paalanen 		return;
662c61e211dSHarvey Harrison 
663c61e211dSHarvey Harrison 	/*
664c61e211dSHarvey Harrison 	 * We fault-in kernel-space virtual memory on-demand. The
665c61e211dSHarvey Harrison 	 * 'reference' page table is init_mm.pgd.
666c61e211dSHarvey Harrison 	 *
667c61e211dSHarvey Harrison 	 * NOTE! We MUST NOT take any locks for this case. We may
668c61e211dSHarvey Harrison 	 * be in an interrupt or a critical region, and should
669c61e211dSHarvey Harrison 	 * only copy the information from the master page table,
670c61e211dSHarvey Harrison 	 * nothing more.
671c61e211dSHarvey Harrison 	 *
672c61e211dSHarvey Harrison 	 * This verifies that the fault happens in kernel space
673c61e211dSHarvey Harrison 	 * (error_code & 4) == 0, and that the fault was not a
674c61e211dSHarvey Harrison 	 * protection error (error_code & 9) == 0.
675c61e211dSHarvey Harrison 	 */
676c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
677c61e211dSHarvey Harrison 	if (unlikely(address >= TASK_SIZE)) {
678cf89ec92SHarvey Harrison #else
679cf89ec92SHarvey Harrison 	if (unlikely(address >= TASK_SIZE64)) {
680cf89ec92SHarvey Harrison #endif
681c61e211dSHarvey Harrison 		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
682c61e211dSHarvey Harrison 		    vmalloc_fault(address) >= 0)
683c61e211dSHarvey Harrison 			return;
6845b727a3bSJeremy Fitzhardinge 
6855b727a3bSJeremy Fitzhardinge 		/* Can handle a stale RO->RW TLB */
6865b727a3bSJeremy Fitzhardinge 		if (spurious_fault(address, error_code))
6875b727a3bSJeremy Fitzhardinge 			return;
6885b727a3bSJeremy Fitzhardinge 
689c61e211dSHarvey Harrison 		/*
690c61e211dSHarvey Harrison 		 * Don't take the mm semaphore here. If we fixup a prefetch
691c61e211dSHarvey Harrison 		 * fault we could otherwise deadlock.
692c61e211dSHarvey Harrison 		 */
693c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
694c61e211dSHarvey Harrison 	}
695c61e211dSHarvey Harrison 
696cf89ec92SHarvey Harrison 
697cf89ec92SHarvey Harrison #ifdef CONFIG_X86_32
698c61e211dSHarvey Harrison 	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
699c61e211dSHarvey Harrison 	   fault has been handled. */
7006b6891f9Sgorcunov@gmail.com 	if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
701c61e211dSHarvey Harrison 		local_irq_enable();
702c61e211dSHarvey Harrison 
703c61e211dSHarvey Harrison 	/*
704c61e211dSHarvey Harrison 	 * If we're in an interrupt, have no user context or are running in an
705c61e211dSHarvey Harrison 	 * atomic region then we must not take the fault.
706c61e211dSHarvey Harrison 	 */
707c61e211dSHarvey Harrison 	if (in_atomic() || !mm)
708c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
709c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
710c61e211dSHarvey Harrison 	if (likely(regs->flags & X86_EFLAGS_IF))
711c61e211dSHarvey Harrison 		local_irq_enable();
712c61e211dSHarvey Harrison 
713c61e211dSHarvey Harrison 	if (unlikely(error_code & PF_RSVD))
714c61e211dSHarvey Harrison 		pgtable_bad(address, regs, error_code);
715c61e211dSHarvey Harrison 
716c61e211dSHarvey Harrison 	/*
717c61e211dSHarvey Harrison 	 * If we're in an interrupt, have no user context or are running in an
718c61e211dSHarvey Harrison 	 * atomic region then we must not take the fault.
719c61e211dSHarvey Harrison 	 */
720c61e211dSHarvey Harrison 	if (unlikely(in_atomic() || !mm))
721c61e211dSHarvey Harrison 		goto bad_area_nosemaphore;
722c61e211dSHarvey Harrison 
723c61e211dSHarvey Harrison 	/*
724c61e211dSHarvey Harrison 	 * User-mode registers count as a user access even for any
725c61e211dSHarvey Harrison 	 * potential system fault or CPU buglet.
726c61e211dSHarvey Harrison 	 */
727c61e211dSHarvey Harrison 	if (user_mode_vm(regs))
728c61e211dSHarvey Harrison 		error_code |= PF_USER;
729c61e211dSHarvey Harrison again:
730c61e211dSHarvey Harrison #endif
731c61e211dSHarvey Harrison 	/* When running in the kernel we expect faults to occur only to
732c61e211dSHarvey Harrison 	 * addresses in user space.  All other faults represent errors in the
733c61e211dSHarvey Harrison 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
734c61e211dSHarvey Harrison 	 * erroneous fault occurring in a code path which already holds mmap_sem
735c61e211dSHarvey Harrison 	 * we will deadlock attempting to validate the fault against the
736c61e211dSHarvey Harrison 	 * address space.  Luckily the kernel only validly references user
737c61e211dSHarvey Harrison 	 * space from well defined areas of code, which are listed in the
738c61e211dSHarvey Harrison 	 * exceptions table.
739c61e211dSHarvey Harrison 	 *
740c61e211dSHarvey Harrison 	 * As the vast majority of faults will be valid we will only perform
741c61e211dSHarvey Harrison 	 * the source reference check when there is a possibility of a deadlock.
742c61e211dSHarvey Harrison 	 * Attempt to lock the address space, if we cannot we then validate the
743c61e211dSHarvey Harrison 	 * source.  If this is invalid we can skip the address space check,
744c61e211dSHarvey Harrison 	 * thus avoiding the deadlock.
745c61e211dSHarvey Harrison 	 */
746c61e211dSHarvey Harrison 	if (!down_read_trylock(&mm->mmap_sem)) {
747c61e211dSHarvey Harrison 		if ((error_code & PF_USER) == 0 &&
748c61e211dSHarvey Harrison 		    !search_exception_tables(regs->ip))
749c61e211dSHarvey Harrison 			goto bad_area_nosemaphore;
750c61e211dSHarvey Harrison 		down_read(&mm->mmap_sem);
751c61e211dSHarvey Harrison 	}
752c61e211dSHarvey Harrison 
753c61e211dSHarvey Harrison 	vma = find_vma(mm, address);
754c61e211dSHarvey Harrison 	if (!vma)
755c61e211dSHarvey Harrison 		goto bad_area;
756c61e211dSHarvey Harrison 	if (vma->vm_start <= address)
757c61e211dSHarvey Harrison 		goto good_area;
758c61e211dSHarvey Harrison 	if (!(vma->vm_flags & VM_GROWSDOWN))
759c61e211dSHarvey Harrison 		goto bad_area;
760c61e211dSHarvey Harrison 	if (error_code & PF_USER) {
761c61e211dSHarvey Harrison 		/*
762c61e211dSHarvey Harrison 		 * Accessing the stack below %sp is always a bug.
763c61e211dSHarvey Harrison 		 * The large cushion allows instructions like enter
764c61e211dSHarvey Harrison 		 * and pusha to work.  ("enter $65535,$31" pushes
765c61e211dSHarvey Harrison 		 * 32 pointers and then decrements %sp by 65535.)
766c61e211dSHarvey Harrison 		 */
767c61e211dSHarvey Harrison 		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
768c61e211dSHarvey Harrison 			goto bad_area;
769c61e211dSHarvey Harrison 	}
770c61e211dSHarvey Harrison 	if (expand_stack(vma, address))
771c61e211dSHarvey Harrison 		goto bad_area;
772c61e211dSHarvey Harrison /*
773c61e211dSHarvey Harrison  * Ok, we have a good vm_area for this memory access, so
774c61e211dSHarvey Harrison  * we can handle it..
775c61e211dSHarvey Harrison  */
776c61e211dSHarvey Harrison good_area:
777c61e211dSHarvey Harrison 	si_code = SEGV_ACCERR;
778c61e211dSHarvey Harrison 	write = 0;
779c61e211dSHarvey Harrison 	switch (error_code & (PF_PROT|PF_WRITE)) {
780c61e211dSHarvey Harrison 	default:	/* 3: write, present */
781c61e211dSHarvey Harrison 		/* fall through */
782c61e211dSHarvey Harrison 	case PF_WRITE:		/* write, not present */
783c61e211dSHarvey Harrison 		if (!(vma->vm_flags & VM_WRITE))
784c61e211dSHarvey Harrison 			goto bad_area;
785c61e211dSHarvey Harrison 		write++;
786c61e211dSHarvey Harrison 		break;
787c61e211dSHarvey Harrison 	case PF_PROT:		/* read, present */
788c61e211dSHarvey Harrison 		goto bad_area;
789c61e211dSHarvey Harrison 	case 0:			/* read, not present */
790c61e211dSHarvey Harrison 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
791c61e211dSHarvey Harrison 			goto bad_area;
792c61e211dSHarvey Harrison 	}
793c61e211dSHarvey Harrison 
794c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
795c61e211dSHarvey Harrison survive:
796c61e211dSHarvey Harrison #endif
797c61e211dSHarvey Harrison 	/*
798c61e211dSHarvey Harrison 	 * If for any reason at all we couldn't handle the fault,
799c61e211dSHarvey Harrison 	 * make sure we exit gracefully rather than endlessly redo
800c61e211dSHarvey Harrison 	 * the fault.
801c61e211dSHarvey Harrison 	 */
802c61e211dSHarvey Harrison 	fault = handle_mm_fault(mm, vma, address, write);
803c61e211dSHarvey Harrison 	if (unlikely(fault & VM_FAULT_ERROR)) {
804c61e211dSHarvey Harrison 		if (fault & VM_FAULT_OOM)
805c61e211dSHarvey Harrison 			goto out_of_memory;
806c61e211dSHarvey Harrison 		else if (fault & VM_FAULT_SIGBUS)
807c61e211dSHarvey Harrison 			goto do_sigbus;
808c61e211dSHarvey Harrison 		BUG();
809c61e211dSHarvey Harrison 	}
810c61e211dSHarvey Harrison 	if (fault & VM_FAULT_MAJOR)
811c61e211dSHarvey Harrison 		tsk->maj_flt++;
812c61e211dSHarvey Harrison 	else
813c61e211dSHarvey Harrison 		tsk->min_flt++;
814c61e211dSHarvey Harrison 
815c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
816c61e211dSHarvey Harrison 	/*
817c61e211dSHarvey Harrison 	 * Did it hit the DOS screen memory VA from vm86 mode?
818c61e211dSHarvey Harrison 	 */
819c61e211dSHarvey Harrison 	if (v8086_mode(regs)) {
820c61e211dSHarvey Harrison 		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
821c61e211dSHarvey Harrison 		if (bit < 32)
822c61e211dSHarvey Harrison 			tsk->thread.screen_bitmap |= 1 << bit;
823c61e211dSHarvey Harrison 	}
824c61e211dSHarvey Harrison #endif
825c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
826c61e211dSHarvey Harrison 	return;
827c61e211dSHarvey Harrison 
828c61e211dSHarvey Harrison /*
829c61e211dSHarvey Harrison  * Something tried to access memory that isn't in our memory map..
830c61e211dSHarvey Harrison  * Fix it, but check if it's kernel or user first..
831c61e211dSHarvey Harrison  */
832c61e211dSHarvey Harrison bad_area:
833c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
834c61e211dSHarvey Harrison 
835c61e211dSHarvey Harrison bad_area_nosemaphore:
836c61e211dSHarvey Harrison 	/* User mode accesses just cause a SIGSEGV */
837c61e211dSHarvey Harrison 	if (error_code & PF_USER) {
838c61e211dSHarvey Harrison 		/*
839c61e211dSHarvey Harrison 		 * It's possible to have interrupts off here.
840c61e211dSHarvey Harrison 		 */
841c61e211dSHarvey Harrison 		local_irq_enable();
842c61e211dSHarvey Harrison 
843c61e211dSHarvey Harrison 		/*
844c61e211dSHarvey Harrison 		 * Valid to do another page fault here because this one came
845c61e211dSHarvey Harrison 		 * from user space.
846c61e211dSHarvey Harrison 		 */
847c61e211dSHarvey Harrison 		if (is_prefetch(regs, address, error_code))
848c61e211dSHarvey Harrison 			return;
849c61e211dSHarvey Harrison 
850c61e211dSHarvey Harrison 		if (is_errata100(regs, address))
851c61e211dSHarvey Harrison 			return;
852c61e211dSHarvey Harrison 
853c61e211dSHarvey Harrison 		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
854c61e211dSHarvey Harrison 		    printk_ratelimit()) {
855c61e211dSHarvey Harrison 			printk(
856c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
857c61e211dSHarvey Harrison 			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
858c61e211dSHarvey Harrison #else
859c61e211dSHarvey Harrison 			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
860c61e211dSHarvey Harrison #endif
861c61e211dSHarvey Harrison 			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
862c61e211dSHarvey Harrison 			tsk->comm, task_pid_nr(tsk), address, regs->ip,
863c61e211dSHarvey Harrison 			regs->sp, error_code);
864c61e211dSHarvey Harrison 			print_vma_addr(" in ", regs->ip);
865c61e211dSHarvey Harrison 			printk("\n");
866c61e211dSHarvey Harrison 		}
867c61e211dSHarvey Harrison 
868c61e211dSHarvey Harrison 		tsk->thread.cr2 = address;
869c61e211dSHarvey Harrison 		/* Kernel addresses are always protection faults */
870c61e211dSHarvey Harrison 		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
871c61e211dSHarvey Harrison 		tsk->thread.trap_no = 14;
872c61e211dSHarvey Harrison 		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
873c61e211dSHarvey Harrison 		return;
874c61e211dSHarvey Harrison 	}
875c61e211dSHarvey Harrison 
876c61e211dSHarvey Harrison 	if (is_f00f_bug(regs, address))
877c61e211dSHarvey Harrison 		return;
878c61e211dSHarvey Harrison 
879c61e211dSHarvey Harrison no_context:
880c61e211dSHarvey Harrison 	/* Are we prepared to handle this kernel fault?  */
881c61e211dSHarvey Harrison 	if (fixup_exception(regs))
882c61e211dSHarvey Harrison 		return;
883c61e211dSHarvey Harrison 
884c61e211dSHarvey Harrison 	/*
885c61e211dSHarvey Harrison 	 * X86_32
886c61e211dSHarvey Harrison 	 * Valid to do another page fault here, because if this fault
887c61e211dSHarvey Harrison 	 * had been triggered by is_prefetch fixup_exception would have
888c61e211dSHarvey Harrison 	 * handled it.
889c61e211dSHarvey Harrison 	 *
890c61e211dSHarvey Harrison 	 * X86_64
891c61e211dSHarvey Harrison 	 * Hall of shame of CPU/BIOS bugs.
892c61e211dSHarvey Harrison 	 */
893c61e211dSHarvey Harrison 	if (is_prefetch(regs, address, error_code))
894c61e211dSHarvey Harrison 		return;
895c61e211dSHarvey Harrison 
896c61e211dSHarvey Harrison 	if (is_errata93(regs, address))
897c61e211dSHarvey Harrison 		return;
898c61e211dSHarvey Harrison 
899c61e211dSHarvey Harrison /*
900c61e211dSHarvey Harrison  * Oops. The kernel tried to access some bad page. We'll have to
901c61e211dSHarvey Harrison  * terminate things with extreme prejudice.
902c61e211dSHarvey Harrison  */
903c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
904c61e211dSHarvey Harrison 	bust_spinlocks(1);
905fd40d6e3SHarvey Harrison #else
906fd40d6e3SHarvey Harrison 	flags = oops_begin();
907fd40d6e3SHarvey Harrison #endif
908c61e211dSHarvey Harrison 
909c61e211dSHarvey Harrison 	show_fault_oops(regs, error_code, address);
910c61e211dSHarvey Harrison 
911c61e211dSHarvey Harrison 	tsk->thread.cr2 = address;
912c61e211dSHarvey Harrison 	tsk->thread.trap_no = 14;
913c61e211dSHarvey Harrison 	tsk->thread.error_code = error_code;
914fd40d6e3SHarvey Harrison 
915fd40d6e3SHarvey Harrison #ifdef CONFIG_X86_32
916c61e211dSHarvey Harrison 	die("Oops", regs, error_code);
917c61e211dSHarvey Harrison 	bust_spinlocks(0);
918c61e211dSHarvey Harrison 	do_exit(SIGKILL);
919fd40d6e3SHarvey Harrison #else
920c61e211dSHarvey Harrison 	if (__die("Oops", regs, error_code))
921c61e211dSHarvey Harrison 		regs = NULL;
922c61e211dSHarvey Harrison 	/* Executive summary in case the body of the oops scrolled away */
923c61e211dSHarvey Harrison 	printk(KERN_EMERG "CR2: %016lx\n", address);
924c61e211dSHarvey Harrison 	oops_end(flags, regs, SIGKILL);
925c61e211dSHarvey Harrison #endif
926c61e211dSHarvey Harrison 
927c61e211dSHarvey Harrison /*
928c61e211dSHarvey Harrison  * We ran out of memory, or some other thing happened to us that made
929c61e211dSHarvey Harrison  * us unable to handle the page fault gracefully.
930c61e211dSHarvey Harrison  */
931c61e211dSHarvey Harrison out_of_memory:
932c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
933c61e211dSHarvey Harrison 	if (is_global_init(tsk)) {
934c61e211dSHarvey Harrison 		yield();
935fd40d6e3SHarvey Harrison #ifdef CONFIG_X86_32
936c61e211dSHarvey Harrison 		down_read(&mm->mmap_sem);
937c61e211dSHarvey Harrison 		goto survive;
938c61e211dSHarvey Harrison #else
939c61e211dSHarvey Harrison 		goto again;
940c61e211dSHarvey Harrison #endif
941fd40d6e3SHarvey Harrison 	}
942fd40d6e3SHarvey Harrison 
943c61e211dSHarvey Harrison 	printk("VM: killing process %s\n", tsk->comm);
944c61e211dSHarvey Harrison 	if (error_code & PF_USER)
945c61e211dSHarvey Harrison 		do_group_exit(SIGKILL);
946c61e211dSHarvey Harrison 	goto no_context;
947c61e211dSHarvey Harrison 
948c61e211dSHarvey Harrison do_sigbus:
949c61e211dSHarvey Harrison 	up_read(&mm->mmap_sem);
950c61e211dSHarvey Harrison 
951c61e211dSHarvey Harrison 	/* Kernel mode? Handle exceptions or die */
952c61e211dSHarvey Harrison 	if (!(error_code & PF_USER))
953c61e211dSHarvey Harrison 		goto no_context;
954c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
955c61e211dSHarvey Harrison 	/* User space => ok to do another page fault */
956c61e211dSHarvey Harrison 	if (is_prefetch(regs, address, error_code))
957c61e211dSHarvey Harrison 		return;
958c61e211dSHarvey Harrison #endif
959c61e211dSHarvey Harrison 	tsk->thread.cr2 = address;
960c61e211dSHarvey Harrison 	tsk->thread.error_code = error_code;
961c61e211dSHarvey Harrison 	tsk->thread.trap_no = 14;
962c61e211dSHarvey Harrison 	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
963c61e211dSHarvey Harrison }
964c61e211dSHarvey Harrison 
965c61e211dSHarvey Harrison DEFINE_SPINLOCK(pgd_lock);
966c61e211dSHarvey Harrison LIST_HEAD(pgd_list);
967c61e211dSHarvey Harrison 
968c61e211dSHarvey Harrison void vmalloc_sync_all(void)
969c61e211dSHarvey Harrison {
970c61e211dSHarvey Harrison #ifdef CONFIG_X86_32
971c61e211dSHarvey Harrison 	/*
972c61e211dSHarvey Harrison 	 * Note that races in the updates of insync and start aren't
973c61e211dSHarvey Harrison 	 * problematic: insync can only get set bits added, and updates to
974c61e211dSHarvey Harrison 	 * start are only improving performance (without affecting correctness
975c61e211dSHarvey Harrison 	 * if undone).
976c61e211dSHarvey Harrison 	 */
977c61e211dSHarvey Harrison 	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
978c61e211dSHarvey Harrison 	static unsigned long start = TASK_SIZE;
979c61e211dSHarvey Harrison 	unsigned long address;
980c61e211dSHarvey Harrison 
981c61e211dSHarvey Harrison 	if (SHARED_KERNEL_PMD)
982c61e211dSHarvey Harrison 		return;
983c61e211dSHarvey Harrison 
984c61e211dSHarvey Harrison 	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
985c61e211dSHarvey Harrison 	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
986c61e211dSHarvey Harrison 		if (!test_bit(pgd_index(address), insync)) {
987c61e211dSHarvey Harrison 			unsigned long flags;
988c61e211dSHarvey Harrison 			struct page *page;
989c61e211dSHarvey Harrison 
990c61e211dSHarvey Harrison 			spin_lock_irqsave(&pgd_lock, flags);
991e3ed910dSJeremy Fitzhardinge 			list_for_each_entry(page, &pgd_list, lru) {
992c61e211dSHarvey Harrison 				if (!vmalloc_sync_one(page_address(page),
993e3ed910dSJeremy Fitzhardinge 						      address))
994c61e211dSHarvey Harrison 					break;
995c61e211dSHarvey Harrison 			}
996c61e211dSHarvey Harrison 			spin_unlock_irqrestore(&pgd_lock, flags);
997c61e211dSHarvey Harrison 			if (!page)
998c61e211dSHarvey Harrison 				set_bit(pgd_index(address), insync);
999c61e211dSHarvey Harrison 		}
1000c61e211dSHarvey Harrison 		if (address == start && test_bit(pgd_index(address), insync))
1001c61e211dSHarvey Harrison 			start = address + PGDIR_SIZE;
1002c61e211dSHarvey Harrison 	}
1003c61e211dSHarvey Harrison #else /* CONFIG_X86_64 */
1004c61e211dSHarvey Harrison 	/*
1005c61e211dSHarvey Harrison 	 * Note that races in the updates of insync and start aren't
1006c61e211dSHarvey Harrison 	 * problematic: insync can only get set bits added, and updates to
1007c61e211dSHarvey Harrison 	 * start are only improving performance (without affecting correctness
1008c61e211dSHarvey Harrison 	 * if undone).
1009c61e211dSHarvey Harrison 	 */
1010c61e211dSHarvey Harrison 	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
1011c61e211dSHarvey Harrison 	static unsigned long start = VMALLOC_START & PGDIR_MASK;
1012c61e211dSHarvey Harrison 	unsigned long address;
1013c61e211dSHarvey Harrison 
1014c61e211dSHarvey Harrison 	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
1015c61e211dSHarvey Harrison 		if (!test_bit(pgd_index(address), insync)) {
1016c61e211dSHarvey Harrison 			const pgd_t *pgd_ref = pgd_offset_k(address);
101758d5d0d8SIngo Molnar 			unsigned long flags;
1018c61e211dSHarvey Harrison 			struct page *page;
1019c61e211dSHarvey Harrison 
1020c61e211dSHarvey Harrison 			if (pgd_none(*pgd_ref))
1021c61e211dSHarvey Harrison 				continue;
102258d5d0d8SIngo Molnar 			spin_lock_irqsave(&pgd_lock, flags);
1023c61e211dSHarvey Harrison 			list_for_each_entry(page, &pgd_list, lru) {
1024c61e211dSHarvey Harrison 				pgd_t *pgd;
1025c61e211dSHarvey Harrison 				pgd = (pgd_t *)page_address(page) + pgd_index(address);
1026c61e211dSHarvey Harrison 				if (pgd_none(*pgd))
1027c61e211dSHarvey Harrison 					set_pgd(pgd, *pgd_ref);
1028c61e211dSHarvey Harrison 				else
1029c61e211dSHarvey Harrison 					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
1030c61e211dSHarvey Harrison 			}
103158d5d0d8SIngo Molnar 			spin_unlock_irqrestore(&pgd_lock, flags);
1032c61e211dSHarvey Harrison 			set_bit(pgd_index(address), insync);
1033c61e211dSHarvey Harrison 		}
1034c61e211dSHarvey Harrison 		if (address == start)
1035c61e211dSHarvey Harrison 			start = address + PGDIR_SIZE;
1036c61e211dSHarvey Harrison 	}
1037c61e211dSHarvey Harrison #endif
1038c61e211dSHarvey Harrison }
1039