xref: /openbmc/linux/arch/x86/mm/fault.c (revision e8e0929d)
1 /*
2  *  Copyright (C) 1995  Linus Torvalds
3  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
5  */
6 #include <linux/magic.h>		/* STACK_END_MAGIC		*/
7 #include <linux/sched.h>		/* test_thread_flag(), ...	*/
8 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
9 #include <linux/module.h>		/* search_exception_table	*/
10 #include <linux/bootmem.h>		/* max_low_pfn			*/
11 #include <linux/kprobes.h>		/* __kprobes, ...		*/
12 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
13 #include <linux/perf_event.h>		/* perf_sw_event		*/
14 
15 #include <asm/traps.h>			/* dotraplinkage, ...		*/
16 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
17 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
18 
19 /*
20  * Page fault error code bits:
21  *
22  *   bit 0 ==	 0: no page found	1: protection fault
23  *   bit 1 ==	 0: read access		1: write access
24  *   bit 2 ==	 0: kernel-mode access	1: user-mode access
25  *   bit 3 ==				1: use of reserved bit detected
26  *   bit 4 ==				1: fault was an instruction fetch
27  */
28 enum x86_pf_error_code {
29 
30 	PF_PROT		=		1 << 0,
31 	PF_WRITE	=		1 << 1,
32 	PF_USER		=		1 << 2,
33 	PF_RSVD		=		1 << 3,
34 	PF_INSTR	=		1 << 4,
35 };
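/*
 * A few decoded examples of the bits above (illustrative only):
 *
 *   error_code == 0x02 (PF_WRITE):                  kernel write to a
 *                                                   not-present page
 *   error_code == 0x07 (PF_PROT|PF_WRITE|PF_USER):  user write to a present
 *                                                   page that denies write
 *                                                   access
 *   error_code == 0x14 (PF_USER|PF_INSTR):          user instruction fetch
 *                                                   from a not-present page
 */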
36 
37 /*
38  * Returns 0 if mmiotrace is disabled, or if the fault is not
39  * handled by mmiotrace:
40  */
41 static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
42 {
43 	if (unlikely(is_kmmio_active()))
44 		if (kmmio_handler(regs, addr) == 1)
45 			return -1;
46 	return 0;
47 }
48 
49 static inline int notify_page_fault(struct pt_regs *regs)
50 {
51 	int ret = 0;
52 
53 	/* kprobe_running() needs smp_processor_id() */
54 	if (kprobes_built_in() && !user_mode_vm(regs)) {
55 		preempt_disable();
56 		if (kprobe_running() && kprobe_fault_handler(regs, 14))
57 			ret = 1;
58 		preempt_enable();
59 	}
60 
61 	return ret;
62 }
63 
64 /*
65  * Prefetch quirks:
66  *
67  * 32-bit mode:
68  *
69  *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
70  *   Check that here and ignore it.
71  *
72  * 64-bit mode:
73  *
74  *   Sometimes the CPU reports invalid exceptions on prefetch.
75  *   Check that here and ignore it.
76  *
77  * Opcode checker based on code by Richard Brunner.
78  */
79 static inline int
80 check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
81 		      unsigned char opcode, int *prefetch)
82 {
83 	unsigned char instr_hi = opcode & 0xf0;
84 	unsigned char instr_lo = opcode & 0x0f;
85 
86 	switch (instr_hi) {
87 	case 0x20:
88 	case 0x30:
89 		/*
90 		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
91 		 * In X86_64 long mode, the CPU will signal an invalid
92 		 * opcode if some of these prefixes are present, so
93 		 * X86_64 will never get here anyway.
94 		 */
95 		return ((instr_lo & 7) == 0x6);
96 #ifdef CONFIG_X86_64
97 	case 0x40:
98 		/*
99 		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
100 		 * We need to figure out under what instruction mode the
101 		 * instruction was issued. We could check the LDT for lm,
102 		 * but for now it's good enough to assume that long mode
103 		 * only uses well-known segments or runs in the kernel.
104 		 */
105 		return (!user_mode(regs)) || (regs->cs == __USER_CS);
106 #endif
107 	case 0x60:
108 		/* 0x64 thru 0x67 are valid prefixes in all modes. */
109 		return (instr_lo & 0xC) == 0x4;
110 	case 0xF0:
111 		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
112 		return !instr_lo || (instr_lo>>1) == 1;
113 	case 0x00:
114 		/* Prefetch instruction is 0x0F0D or 0x0F18 */
115 		if (probe_kernel_address(instr, opcode))
116 			return 0;
117 
118 		*prefetch = (instr_lo == 0xF) &&
119 			(opcode == 0x0D || opcode == 0x18);
120 		return 0;
121 	default:
122 		return 0;
123 	}
124 }
125 
126 static int
127 is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
128 {
129 	unsigned char *max_instr;
130 	unsigned char *instr;
131 	int prefetch = 0;
132 
133 	/*
134 	 * If it was an exec (instruction fetch) fault on an NX page, then
135 	 * do not ignore the fault:
136 	 */
137 	if (error_code & PF_INSTR)
138 		return 0;
139 
140 	instr = (void *)convert_ip_to_linear(current, regs);
141 	max_instr = instr + 15;
142 
143 	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
144 		return 0;
145 
146 	while (instr < max_instr) {
147 		unsigned char opcode;
148 
149 		if (probe_kernel_address(instr, opcode))
150 			break;
151 
152 		instr++;
153 
154 		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
155 			break;
156 	}
157 	return prefetch;
158 }
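/*
 * Illustrative walk-through (not from the original source): for a faulting
 * "prefetchnta %gs:(%eax)", encoded 65 0f 18 00, the scan above first sees
 * 0x65 (instr_hi 0x60, (instr_lo & 0xC) == 0x4, a valid prefix, so keep
 * scanning), then 0x0f (instr_hi 0x00, next byte 0x18, so *prefetch = 1).
 * is_prefetch() returns 1 and the spurious fault is ignored by the callers.
 */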
159 
160 static void
161 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
162 		     struct task_struct *tsk)
163 {
164 	siginfo_t info;
165 
166 	info.si_signo	= si_signo;
167 	info.si_errno	= 0;
168 	info.si_code	= si_code;
169 	info.si_addr	= (void __user *)address;
170 	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
171 
172 	force_sig_info(si_signo, &info, tsk);
173 }
174 
175 DEFINE_SPINLOCK(pgd_lock);
176 LIST_HEAD(pgd_list);
177 
178 #ifdef CONFIG_X86_32
179 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
180 {
181 	unsigned index = pgd_index(address);
182 	pgd_t *pgd_k;
183 	pud_t *pud, *pud_k;
184 	pmd_t *pmd, *pmd_k;
185 
186 	pgd += index;
187 	pgd_k = init_mm.pgd + index;
188 
189 	if (!pgd_present(*pgd_k))
190 		return NULL;
191 
192 	/*
193 	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
194 	 * and redundant with the set_pmd() on non-PAE. As would
195 	 * set_pud.
196 	 */
197 	pud = pud_offset(pgd, address);
198 	pud_k = pud_offset(pgd_k, address);
199 	if (!pud_present(*pud_k))
200 		return NULL;
201 
202 	pmd = pmd_offset(pud, address);
203 	pmd_k = pmd_offset(pud_k, address);
204 	if (!pmd_present(*pmd_k))
205 		return NULL;
206 
207 	if (!pmd_present(*pmd))
208 		set_pmd(pmd, *pmd_k);
209 	else
210 		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
211 
212 	return pmd_k;
213 }
214 
215 void vmalloc_sync_all(void)
216 {
217 	unsigned long address;
218 
219 	if (SHARED_KERNEL_PMD)
220 		return;
221 
222 	for (address = VMALLOC_START & PMD_MASK;
223 	     address >= TASK_SIZE && address < FIXADDR_TOP;
224 	     address += PMD_SIZE) {
225 
226 		unsigned long flags;
227 		struct page *page;
228 
229 		spin_lock_irqsave(&pgd_lock, flags);
230 		list_for_each_entry(page, &pgd_list, lru) {
231 			if (!vmalloc_sync_one(page_address(page), address))
232 				break;
233 		}
234 		spin_unlock_irqrestore(&pgd_lock, flags);
235 	}
236 }
237 
238 /*
239  * 32-bit:
240  *
241  *   Handle a fault on the vmalloc or module mapping area
242  */
243 static noinline int vmalloc_fault(unsigned long address)
244 {
245 	unsigned long pgd_paddr;
246 	pmd_t *pmd_k;
247 	pte_t *pte_k;
248 
249 	/* Make sure we are in vmalloc area: */
250 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
251 		return -1;
252 
253 	/*
254 	 * Synchronize this task's top level page-table
255 	 * with the 'reference' page table.
256 	 *
257 	 * Do _not_ use "current" here. We might be inside
258 	 * an interrupt in the middle of a task switch..
259 	 */
260 	pgd_paddr = read_cr3();
261 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
262 	if (!pmd_k)
263 		return -1;
264 
265 	pte_k = pte_offset_kernel(pmd_k, address);
266 	if (!pte_present(*pte_k))
267 		return -1;
268 
269 	return 0;
270 }
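/*
 * Example scenario (illustrative only): vmalloc() installs a new pmd in
 * init_mm.pgd, but a task whose pgd was allocated earlier has no entry for
 * that range yet.  Its first access faults, vmalloc_fault() copies the pmd
 * from the reference page table via vmalloc_sync_one(), and the access is
 * retried without ever taking mmap_sem.
 */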
271 
272 /*
273  * Did it hit the DOS screen memory VA from vm86 mode?
274  */
275 static inline void
276 check_v8086_mode(struct pt_regs *regs, unsigned long address,
277 		 struct task_struct *tsk)
278 {
279 	unsigned long bit;
280 
281 	if (!v8086_mode(regs))
282 		return;
283 
284 	bit = (address - 0xA0000) >> PAGE_SHIFT;
285 	if (bit < 32)
286 		tsk->thread.screen_bitmap |= 1 << bit;
287 }
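/*
 * Worked example (illustrative only): a vm86 fault at 0xB8000 (the colour
 * text-mode VGA window) gives bit = (0xB8000 - 0xA0000) >> 12 = 24, so
 * bit 24 of thread.screen_bitmap is set.
 */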
288 
289 static bool low_pfn(unsigned long pfn)
290 {
291 	return pfn < max_low_pfn;
292 }
293 
294 static void dump_pagetable(unsigned long address)
295 {
296 	pgd_t *base = __va(read_cr3());
297 	pgd_t *pgd = &base[pgd_index(address)];
298 	pmd_t *pmd;
299 	pte_t *pte;
300 
301 #ifdef CONFIG_X86_PAE
302 	printk("*pdpt = %016Lx ", pgd_val(*pgd));
303 	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
304 		goto out;
305 #endif
306 	pmd = pmd_offset(pud_offset(pgd, address), address);
307 	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
308 
309 	/*
310 	 * We must not directly access the pte in the highpte
311 	 * case if the page table is located in highmem.
312 	 * And let's rather not kmap-atomic the pte, just in case
313 	 * it's allocated already:
314 	 */
315 	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
316 		goto out;
317 
318 	pte = pte_offset_kernel(pmd, address);
319 	printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
320 out:
321 	printk("\n");
322 }
323 
324 #else /* CONFIG_X86_64: */
325 
326 void vmalloc_sync_all(void)
327 {
328 	unsigned long address;
329 
330 	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
331 	     address += PGDIR_SIZE) {
332 
333 		const pgd_t *pgd_ref = pgd_offset_k(address);
334 		unsigned long flags;
335 		struct page *page;
336 
337 		if (pgd_none(*pgd_ref))
338 			continue;
339 
340 		spin_lock_irqsave(&pgd_lock, flags);
341 		list_for_each_entry(page, &pgd_list, lru) {
342 			pgd_t *pgd;
343 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
344 			if (pgd_none(*pgd))
345 				set_pgd(pgd, *pgd_ref);
346 			else
347 				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
348 		}
349 		spin_unlock_irqrestore(&pgd_lock, flags);
350 	}
351 }
352 
353 /*
354  * 64-bit:
355  *
356  *   Handle a fault on the vmalloc area
357  *
358  * This assumes no large pages in there.
359  */
360 static noinline int vmalloc_fault(unsigned long address)
361 {
362 	pgd_t *pgd, *pgd_ref;
363 	pud_t *pud, *pud_ref;
364 	pmd_t *pmd, *pmd_ref;
365 	pte_t *pte, *pte_ref;
366 
367 	/* Make sure we are in vmalloc area: */
368 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
369 		return -1;
370 
371 	/*
372 	 * Copy kernel mappings over when needed. This can also
373 	 * happen due to a race in a page table update. In the latter
374 	 * case just flush:
375 	 */
376 	pgd = pgd_offset(current->active_mm, address);
377 	pgd_ref = pgd_offset_k(address);
378 	if (pgd_none(*pgd_ref))
379 		return -1;
380 
381 	if (pgd_none(*pgd))
382 		set_pgd(pgd, *pgd_ref);
383 	else
384 		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
385 
386 	/*
387 	 * Below here mismatches are bugs because these lower tables
388 	 * are shared:
389 	 */
390 
391 	pud = pud_offset(pgd, address);
392 	pud_ref = pud_offset(pgd_ref, address);
393 	if (pud_none(*pud_ref))
394 		return -1;
395 
396 	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
397 		BUG();
398 
399 	pmd = pmd_offset(pud, address);
400 	pmd_ref = pmd_offset(pud_ref, address);
401 	if (pmd_none(*pmd_ref))
402 		return -1;
403 
404 	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
405 		BUG();
406 
407 	pte_ref = pte_offset_kernel(pmd_ref, address);
408 	if (!pte_present(*pte_ref))
409 		return -1;
410 
411 	pte = pte_offset_kernel(pmd, address);
412 
413 	/*
414 	 * Don't use pte_page here, because the mappings can point
415 	 * outside mem_map, and the NUMA hash lookup cannot handle
416 	 * that:
417 	 */
418 	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
419 		BUG();
420 
421 	return 0;
422 }
423 
424 static const char errata93_warning[] =
425 KERN_ERR
426 "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
427 "******* Working around it, but it may cause SEGVs or burn power.\n"
428 "******* Please consider a BIOS update.\n"
429 "******* Disabling USB legacy in the BIOS may also help.\n";
430 
431 /*
432  * No vm86 mode in 64-bit mode:
433  */
434 static inline void
435 check_v8086_mode(struct pt_regs *regs, unsigned long address,
436 		 struct task_struct *tsk)
437 {
438 }
439 
440 static int bad_address(void *p)
441 {
442 	unsigned long dummy;
443 
444 	return probe_kernel_address((unsigned long *)p, dummy);
445 }
446 
447 static void dump_pagetable(unsigned long address)
448 {
449 	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
450 	pgd_t *pgd = base + pgd_index(address);
451 	pud_t *pud;
452 	pmd_t *pmd;
453 	pte_t *pte;
454 
455 	if (bad_address(pgd))
456 		goto bad;
457 
458 	printk("PGD %lx ", pgd_val(*pgd));
459 
460 	if (!pgd_present(*pgd))
461 		goto out;
462 
463 	pud = pud_offset(pgd, address);
464 	if (bad_address(pud))
465 		goto bad;
466 
467 	printk("PUD %lx ", pud_val(*pud));
468 	if (!pud_present(*pud) || pud_large(*pud))
469 		goto out;
470 
471 	pmd = pmd_offset(pud, address);
472 	if (bad_address(pmd))
473 		goto bad;
474 
475 	printk("PMD %lx ", pmd_val(*pmd));
476 	if (!pmd_present(*pmd) || pmd_large(*pmd))
477 		goto out;
478 
479 	pte = pte_offset_kernel(pmd, address);
480 	if (bad_address(pte))
481 		goto bad;
482 
483 	printk("PTE %lx", pte_val(*pte));
484 out:
485 	printk("\n");
486 	return;
487 bad:
488 	printk("BAD\n");
489 }
490 
491 #endif /* CONFIG_X86_64 */
492 
493 /*
494  * Workaround for K8 erratum #93 & buggy BIOS.
495  *
496  * BIOS SMM functions are required to use a specific workaround
497  * to avoid corruption of the 64bit RIP register on C stepping K8.
498  * to avoid corruption of the 64-bit RIP register on C stepping K8.
499  * A lot of BIOSes that didn't get tested properly miss this.
500  *
501  * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
502  * Try to work around it here.
503  *
504  * Note we only handle faults in the kernel here.
505  * Does nothing on 32-bit.
506  */
507 static int is_errata93(struct pt_regs *regs, unsigned long address)
508 {
509 #ifdef CONFIG_X86_64
510 	if (address != regs->ip)
511 		return 0;
512 
513 	if ((address >> 32) != 0)
514 		return 0;
515 
516 	address |= 0xffffffffUL << 32;
517 	if ((address >= (u64)_stext && address <= (u64)_etext) ||
518 	    (address >= MODULES_VADDR && address <= MODULES_END)) {
519 		printk_once(errata93_warning);
520 		regs->ip = address;
521 		return 1;
522 	}
523 #endif
524 	return 0;
525 }
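/*
 * Illustrative example (the address is made up): with this erratum the CPU
 * may fault with RIP == address == 0x805432a0.  The code above ORs the
 * upper 32 bits back in, giving 0xffffffff805432a0; if that falls inside
 * kernel text or the module area, regs->ip is patched and execution resumes
 * at the intended location.
 */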
526 
527 /*
528  * Work around K8 erratum #100: K8 in compat mode occasionally jumps
529  * to illegal addresses >4GB.
530  *
531  * We catch this in the page fault handler because these addresses
532  * are not reachable. Just detect this case and return.  Any code
533  * segment in the LDT is compatibility mode.
534  */
535 static int is_errata100(struct pt_regs *regs, unsigned long address)
536 {
537 #ifdef CONFIG_X86_64
538 	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
539 		return 1;
540 #endif
541 	return 0;
542 }
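/*
 * Note on the check above: bit 2 of a segment selector is the TI (table
 * indicator) bit, so "regs->cs & (1<<2)" matches any LDT code segment,
 * which per the comment above is assumed to be compatibility mode.
 */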
543 
544 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
545 {
546 #ifdef CONFIG_X86_F00F_BUG
547 	unsigned long nr;
548 
549 	/*
550 	 * Pentium F0 0F C7 C8 bug workaround:
551 	 */
552 	if (boot_cpu_data.f00f_bug) {
553 		nr = (address - idt_descr.address) >> 3;
554 
555 		if (nr == 6) {
556 			do_invalid_op(regs, 0);
557 			return 1;
558 		}
559 	}
560 #endif
561 	return 0;
562 }
563 
564 static const char nx_warning[] = KERN_CRIT
565 "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
566 
567 static void
568 show_fault_oops(struct pt_regs *regs, unsigned long error_code,
569 		unsigned long address)
570 {
571 	if (!oops_may_print())
572 		return;
573 
574 	if (error_code & PF_INSTR) {
575 		unsigned int level;
576 
577 		pte_t *pte = lookup_address(address, &level);
578 
579 		if (pte && pte_present(*pte) && !pte_exec(*pte))
580 			printk(nx_warning, current_uid());
581 	}
582 
583 	printk(KERN_ALERT "BUG: unable to handle kernel ");
584 	if (address < PAGE_SIZE)
585 		printk(KERN_CONT "NULL pointer dereference");
586 	else
587 		printk(KERN_CONT "paging request");
588 
589 	printk(KERN_CONT " at %p\n", (void *) address);
590 	printk(KERN_ALERT "IP:");
591 	printk_address(regs->ip, 1);
592 
593 	dump_pagetable(address);
594 }
595 
596 static noinline void
597 pgtable_bad(struct pt_regs *regs, unsigned long error_code,
598 	    unsigned long address)
599 {
600 	struct task_struct *tsk;
601 	unsigned long flags;
602 	int sig;
603 
604 	flags = oops_begin();
605 	tsk = current;
606 	sig = SIGKILL;
607 
608 	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
609 	       tsk->comm, address);
610 	dump_pagetable(address);
611 
612 	tsk->thread.cr2		= address;
613 	tsk->thread.trap_no	= 14;
614 	tsk->thread.error_code	= error_code;
615 
616 	if (__die("Bad pagetable", regs, error_code))
617 		sig = 0;
618 
619 	oops_end(flags, regs, sig);
620 }
621 
622 static noinline void
623 no_context(struct pt_regs *regs, unsigned long error_code,
624 	   unsigned long address)
625 {
626 	struct task_struct *tsk = current;
627 	unsigned long *stackend;
628 	unsigned long flags;
629 	int sig;
630 
631 	/* Are we prepared to handle this kernel fault? */
632 	if (fixup_exception(regs))
633 		return;
634 
635 	/*
636 	 * 32-bit:
637 	 *
638 	 *   Valid to do another page fault here, because if this fault
639 	 *   had been triggered by is_prefetch fixup_exception would have
640 	 *   handled it.
641 	 *
642 	 * 64-bit:
643 	 *
644 	 *   Hall of shame of CPU/BIOS bugs.
645 	 */
646 	if (is_prefetch(regs, error_code, address))
647 		return;
648 
649 	if (is_errata93(regs, address))
650 		return;
651 
652 	/*
653 	 * Oops. The kernel tried to access some bad page. We'll have to
654 	 * terminate things with extreme prejudice:
655 	 */
656 	flags = oops_begin();
657 
658 	show_fault_oops(regs, error_code, address);
659 
660 	stackend = end_of_stack(tsk);
661 	if (*stackend != STACK_END_MAGIC)
662 		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
663 
664 	tsk->thread.cr2		= address;
665 	tsk->thread.trap_no	= 14;
666 	tsk->thread.error_code	= error_code;
667 
668 	sig = SIGKILL;
669 	if (__die("Oops", regs, error_code))
670 		sig = 0;
671 
672 	/* Executive summary in case the body of the oops scrolled away */
673 	printk(KERN_EMERG "CR2: %016lx\n", address);
674 
675 	oops_end(flags, regs, sig);
676 }
677 
678 /*
679  * Print out info about fatal segfaults, if the show_unhandled_signals
680  * sysctl is set:
681  */
682 static inline void
683 show_signal_msg(struct pt_regs *regs, unsigned long error_code,
684 		unsigned long address, struct task_struct *tsk)
685 {
686 	if (!unhandled_signal(tsk, SIGSEGV))
687 		return;
688 
689 	if (!printk_ratelimit())
690 		return;
691 
692 	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
693 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
694 		tsk->comm, task_pid_nr(tsk), address,
695 		(void *)regs->ip, (void *)regs->sp, error_code);
696 
697 	print_vma_addr(KERN_CONT " in ", regs->ip);
698 
699 	printk(KERN_CONT "\n");
700 }
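/*
 * Example of the resulting log line (all values are made up):
 *
 *   myprog[1234]: segfault at 10 ip 00000000004004f6 sp 00007fff5e8c3a60
 *   error 6 in myprog[400000+1000]
 *
 * where "error 6" is PF_USER|PF_WRITE (a user-mode write to a not-present
 * page) and the trailing "in ..." part comes from print_vma_addr().
 */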
701 
702 static void
703 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
704 		       unsigned long address, int si_code)
705 {
706 	struct task_struct *tsk = current;
707 
708 	/* User mode accesses just cause a SIGSEGV */
709 	if (error_code & PF_USER) {
710 		/*
711 		 * It's possible to have interrupts off here:
712 		 */
713 		local_irq_enable();
714 
715 		/*
716 		 * Valid to do another page fault here because this one came
717 		 * from user space:
718 		 */
719 		if (is_prefetch(regs, error_code, address))
720 			return;
721 
722 		if (is_errata100(regs, address))
723 			return;
724 
725 		if (unlikely(show_unhandled_signals))
726 			show_signal_msg(regs, error_code, address, tsk);
727 
728 		/* Kernel addresses are always protection faults: */
729 		tsk->thread.cr2		= address;
730 		tsk->thread.error_code	= error_code | (address >= TASK_SIZE);
731 		tsk->thread.trap_no	= 14;
732 
733 		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
734 
735 		return;
736 	}
737 
738 	if (is_f00f_bug(regs, address))
739 		return;
740 
741 	no_context(regs, error_code, address);
742 }
743 
744 static noinline void
745 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
746 		     unsigned long address)
747 {
748 	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
749 }
750 
751 static void
752 __bad_area(struct pt_regs *regs, unsigned long error_code,
753 	   unsigned long address, int si_code)
754 {
755 	struct mm_struct *mm = current->mm;
756 
757 	/*
758 	 * Something tried to access memory that isn't in our memory map..
759 	 * Fix it, but check if it's kernel or user first..
760 	 */
761 	up_read(&mm->mmap_sem);
762 
763 	__bad_area_nosemaphore(regs, error_code, address, si_code);
764 }
765 
766 static noinline void
767 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
768 {
769 	__bad_area(regs, error_code, address, SEGV_MAPERR);
770 }
771 
772 static noinline void
773 bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
774 		      unsigned long address)
775 {
776 	__bad_area(regs, error_code, address, SEGV_ACCERR);
777 }
778 
779 /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
780 static void
781 out_of_memory(struct pt_regs *regs, unsigned long error_code,
782 	      unsigned long address)
783 {
784 	/*
785 	 * We ran out of memory, call the OOM killer, and return to userspace
786 	 * (which will retry the fault, or kill us if we got oom-killed):
787 	 */
788 	up_read(&current->mm->mmap_sem);
789 
790 	pagefault_out_of_memory();
791 }
792 
793 static void
794 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
795 	  unsigned int fault)
796 {
797 	struct task_struct *tsk = current;
798 	struct mm_struct *mm = tsk->mm;
799 	int code = BUS_ADRERR;
800 
801 	up_read(&mm->mmap_sem);
802 
803 	/* Kernel mode? Handle exceptions or die: */
804 	if (!(error_code & PF_USER))
805 		no_context(regs, error_code, address);
806 
807 	/* User-space => ok to do another page fault: */
808 	if (is_prefetch(regs, error_code, address))
809 		return;
810 
811 	tsk->thread.cr2		= address;
812 	tsk->thread.error_code	= error_code;
813 	tsk->thread.trap_no	= 14;
814 
815 #ifdef CONFIG_MEMORY_FAILURE
816 	if (fault & VM_FAULT_HWPOISON) {
817 		printk(KERN_ERR
818 	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
819 			tsk->comm, tsk->pid, address);
820 		code = BUS_MCEERR_AR;
821 	}
822 #endif
823 	force_sig_info_fault(SIGBUS, code, address, tsk);
824 }
825 
826 static noinline void
827 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
828 	       unsigned long address, unsigned int fault)
829 {
830 	if (fault & VM_FAULT_OOM) {
831 		out_of_memory(regs, error_code, address);
832 	} else {
833 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
834 			do_sigbus(regs, error_code, address, fault);
835 		else
836 			BUG();
837 	}
838 }
839 
840 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
841 {
842 	if ((error_code & PF_WRITE) && !pte_write(*pte))
843 		return 0;
844 
845 	if ((error_code & PF_INSTR) && !pte_exec(*pte))
846 		return 0;
847 
848 	return 1;
849 }
850 
851 /*
852  * Handle a spurious fault caused by a stale TLB entry.
853  *
854  * This allows us to lazily refresh the TLB when increasing the
855  * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
856  * eagerly is very expensive since that implies doing a full
857  * cross-processor TLB flush, even if no stale TLB entries exist
858  * on other processors.
859  *
860  * There are no security implications to leaving a stale TLB when
861  * increasing the permissions on a page.
862  */
863 static noinline int
864 spurious_fault(unsigned long error_code, unsigned long address)
865 {
866 	pgd_t *pgd;
867 	pud_t *pud;
868 	pmd_t *pmd;
869 	pte_t *pte;
870 	int ret;
871 
872 	/* Reserved-bit violation or user access to kernel space? */
873 	if (error_code & (PF_USER | PF_RSVD))
874 		return 0;
875 
876 	pgd = init_mm.pgd + pgd_index(address);
877 	if (!pgd_present(*pgd))
878 		return 0;
879 
880 	pud = pud_offset(pgd, address);
881 	if (!pud_present(*pud))
882 		return 0;
883 
884 	if (pud_large(*pud))
885 		return spurious_fault_check(error_code, (pte_t *) pud);
886 
887 	pmd = pmd_offset(pud, address);
888 	if (!pmd_present(*pmd))
889 		return 0;
890 
891 	if (pmd_large(*pmd))
892 		return spurious_fault_check(error_code, (pte_t *) pmd);
893 
894 	pte = pte_offset_kernel(pmd, address);
895 	if (!pte_present(*pte))
896 		return 0;
897 
898 	ret = spurious_fault_check(error_code, pte);
899 	if (!ret)
900 		return 0;
901 
902 	/*
903 	 * Make sure we have permissions in PMD.
904 	 * If not, then there's a bug in the page tables:
905 	 */
906 	ret = spurious_fault_check(error_code, (pte_t *) pmd);
907 	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
908 
909 	return ret;
910 }
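/*
 * Typical case handled here (illustrative only): set_memory_rw() upgrades a
 * kernel page from RO to RW but does not flush other CPUs' TLBs.  Another
 * CPU still holding the stale read-only entry then writes to the page and
 * faults with PF_WRITE|PF_PROT; spurious_fault() sees pte_write() set in
 * the current page table, returns 1, and the access is simply retried.
 */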
911 
912 int show_unhandled_signals = 1;
913 
914 static inline int
915 access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
916 {
917 	if (write) {
918 		/* write, present and write, not present: */
919 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
920 			return 1;
921 		return 0;
922 	}
923 
924 	/* read, present: */
925 	if (unlikely(error_code & PF_PROT))
926 		return 1;
927 
928 	/* read, not present: */
929 	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
930 		return 1;
931 
932 	return 0;
933 }
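/*
 * Worked examples (illustrative only): a write fault on a VMA that lacks
 * VM_WRITE returns 1 and ends in SEGV_ACCERR; a read fault with PF_PROT set
 * (present page, insufficient rights) also returns 1; a plain read miss on
 * an ordinary readable VMA returns 0 and is passed to handle_mm_fault().
 */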
934 
935 static int fault_in_kernel_space(unsigned long address)
936 {
937 	return address >= TASK_SIZE_MAX;
938 }
939 
940 /*
941  * This routine handles page faults.  It determines the address,
942  * and the problem, and then passes it off to one of the appropriate
943  * routines.
944  */
945 dotraplinkage void __kprobes
946 do_page_fault(struct pt_regs *regs, unsigned long error_code)
947 {
948 	struct vm_area_struct *vma;
949 	struct task_struct *tsk;
950 	unsigned long address;
951 	struct mm_struct *mm;
952 	int write;
953 	int fault;
954 
955 	tsk = current;
956 	mm = tsk->mm;
957 
958 	/* Get the faulting address: */
959 	address = read_cr2();
960 
961 	/*
962 	 * Detect and handle instructions that would cause a page fault for
963 	 * both a tracked kernel page and a userspace page.
964 	 */
965 	if (kmemcheck_active(regs))
966 		kmemcheck_hide(regs);
967 	prefetchw(&mm->mmap_sem);
968 
969 	if (unlikely(kmmio_fault(regs, address)))
970 		return;
971 
972 	/*
973 	 * We fault-in kernel-space virtual memory on-demand. The
974 	 * 'reference' page table is init_mm.pgd.
975 	 *
976 	 * NOTE! We MUST NOT take any locks for this case. We may
977 	 * be in an interrupt or a critical region, and should
978 	 * only copy the information from the master page table,
979 	 * nothing more.
980 	 *
981 	 * This verifies that the fault happens in kernel space
982 	 * (error_code & 4) == 0, and that the fault was not a
983 	 * protection error (error_code & 9) == 0.
984 	 */
985 	if (unlikely(fault_in_kernel_space(address))) {
986 		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
987 			if (vmalloc_fault(address) >= 0)
988 				return;
989 
990 			if (kmemcheck_fault(regs, address, error_code))
991 				return;
992 		}
993 
994 		/* Can handle a stale RO->RW TLB: */
995 		if (spurious_fault(error_code, address))
996 			return;
997 
998 		/* kprobes don't want to hook the spurious faults: */
999 		if (notify_page_fault(regs))
1000 			return;
1001 		/*
1002 		 * Don't take the mm semaphore here. If we fix up a prefetch
1003 		 * fault we could otherwise deadlock:
1004 		 */
1005 		bad_area_nosemaphore(regs, error_code, address);
1006 
1007 		return;
1008 	}
1009 
1010 	/* kprobes don't want to hook the spurious faults: */
1011 	if (unlikely(notify_page_fault(regs)))
1012 		return;
1013 	/*
1014 	 * It's safe to allow irqs after cr2 has been saved and the
1015 	 * vmalloc fault has been handled.
1016 	 *
1017 	 * User-mode registers count as a user access even for any
1018 	 * potential system fault or CPU buglet:
1019 	 */
1020 	if (user_mode_vm(regs)) {
1021 		local_irq_enable();
1022 		error_code |= PF_USER;
1023 	} else {
1024 		if (regs->flags & X86_EFLAGS_IF)
1025 			local_irq_enable();
1026 	}
1027 
1028 	if (unlikely(error_code & PF_RSVD))
1029 		pgtable_bad(regs, error_code, address);
1030 
1031 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
1032 
1033 	/*
1034 	 * If we're in an interrupt, have no user context or are running
1035 	 * in an atomic region then we must not take the fault:
1036 	 */
1037 	if (unlikely(in_atomic() || !mm)) {
1038 		bad_area_nosemaphore(regs, error_code, address);
1039 		return;
1040 	}
1041 
1042 	/*
1043 	 * When running in the kernel we expect faults to occur only to
1044 	 * addresses in user space.  All other faults represent errors in
1045 	 * the kernel and should generate an OOPS.  Unfortunately, in the
1046 	 * case of an erroneous fault occurring in a code path which already
1047 	 * holds mmap_sem we will deadlock attempting to validate the fault
1048 	 * against the address space.  Luckily the kernel only validly
1049 	 * references user space from well defined areas of code, which are
1050 	 * listed in the exceptions table.
1051 	 *
1052 	 * As the vast majority of faults will be valid we will only perform
1053 	 * the source reference check when there is a possibility of a
1054 	 * deadlock. Attempt to lock the address space, if we cannot we then
1055 	 * validate the source. If this is invalid we can skip the address
1056 	 * space check, thus avoiding the deadlock:
1057 	 */
1058 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1059 		if ((error_code & PF_USER) == 0 &&
1060 		    !search_exception_tables(regs->ip)) {
1061 			bad_area_nosemaphore(regs, error_code, address);
1062 			return;
1063 		}
1064 		down_read(&mm->mmap_sem);
1065 	} else {
1066 		/*
1067 		 * The above down_read_trylock() might have succeeded in
1068 		 * which case we'll have missed the might_sleep() from
1069 		 * down_read():
1070 		 */
1071 		might_sleep();
1072 	}
1073 
1074 	vma = find_vma(mm, address);
1075 	if (unlikely(!vma)) {
1076 		bad_area(regs, error_code, address);
1077 		return;
1078 	}
1079 	if (likely(vma->vm_start <= address))
1080 		goto good_area;
1081 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
1082 		bad_area(regs, error_code, address);
1083 		return;
1084 	}
1085 	if (error_code & PF_USER) {
1086 		/*
1087 		 * Accessing the stack below %sp is always a bug.
1088 		 * The large cushion allows instructions like enter
1089 		 * and pusha to work. ("enter $65535, $31" pushes
1090 		 * 32 pointers and then decrements %sp by 65535.)
1091 		 */
1092 		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
1093 			bad_area(regs, error_code, address);
1094 			return;
1095 		}
1096 	}
1097 	if (unlikely(expand_stack(vma, address))) {
1098 		bad_area(regs, error_code, address);
1099 		return;
1100 	}
1101 
1102 	/*
1103 	 * Ok, we have a good vm_area for this memory access, so
1104 	 * we can handle it..
1105 	 */
1106 good_area:
1107 	write = error_code & PF_WRITE;
1108 
1109 	if (unlikely(access_error(error_code, write, vma))) {
1110 		bad_area_access_error(regs, error_code, address);
1111 		return;
1112 	}
1113 
1114 	/*
1115 	 * If for any reason at all we couldn't handle the fault,
1116 	 * make sure we exit gracefully rather than endlessly redo
1117 	 * the fault:
1118 	 */
1119 	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
1120 
1121 	if (unlikely(fault & VM_FAULT_ERROR)) {
1122 		mm_fault_error(regs, error_code, address, fault);
1123 		return;
1124 	}
1125 
1126 	if (fault & VM_FAULT_MAJOR) {
1127 		tsk->maj_flt++;
1128 		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1129 				     regs, address);
1130 	} else {
1131 		tsk->min_flt++;
1132 		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1133 				     regs, address);
1134 	}
1135 
1136 	check_v8086_mode(regs, address, tsk);
1137 
1138 	up_read(&mm->mmap_sem);
1139 }
1140
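/*
 * Minimal user-space sketch (not part of this file; names are arbitrary)
 * showing the two SIGSEGV si_code values delivered by the paths above:
 * SEGV_ACCERR for a protection violation on a mapped page, SEGV_MAPERR for
 * an unmapped address.
 *
 *	#include <signal.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <sys/mman.h>
 *
 *	static void segv(int sig, siginfo_t *si, void *ctx)
 *	{
 *		printf("SIGSEGV at %p, si_code=%d (%s)\n", si->si_addr,
 *		       si->si_code,
 *		       si->si_code == SEGV_ACCERR ? "ACCERR" : "MAPERR");
 *		exit(0);
 *	}
 *
 *	int main(void)
 *	{
 *		struct sigaction sa = { .sa_sigaction = segv,
 *					.sa_flags = SA_SIGINFO };
 *		char *p = mmap(NULL, 4096, PROT_READ,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		sigaction(SIGSEGV, &sa, NULL);
 *		p[0] = 1;	<- write to a read-only mapping: SEGV_ACCERR
 *		return 0;
 *	}
 */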