xref: /openbmc/linux/arch/x86/mm/fault.c (revision b2765275)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Copyright (C) 1995  Linus Torvalds
4  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
5  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
6  */
7 #include <linux/sched.h>		/* test_thread_flag(), ...	*/
8 #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
9 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
10 #include <linux/extable.h>		/* search_exception_tables	*/
11 #include <linux/memblock.h>		/* max_low_pfn			*/
12 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
13 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
14 #include <linux/perf_event.h>		/* perf_sw_event		*/
15 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
16 #include <linux/prefetch.h>		/* prefetchw			*/
17 #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
18 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
19 #include <linux/efi.h>			/* efi_recover_from_page_fault()*/
20 #include <linux/mm_types.h>
21 
22 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
23 #include <asm/traps.h>			/* dotraplinkage, ...		*/
24 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
25 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
26 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
27 #include <asm/vm86.h>			/* struct vm86			*/
28 #include <asm/mmu_context.h>		/* vma_pkey()			*/
29 #include <asm/efi.h>			/* efi_recover_from_page_fault()*/
30 #include <asm/desc.h>			/* store_idt(), ...		*/
31 #include <asm/cpu_entry_area.h>		/* exception stack		*/
32 #include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
33 
34 #define CREATE_TRACE_POINTS
35 #include <asm/trace/exceptions.h>
36 
37 /*
38  * Returns 0 if mmiotrace is disabled, or if the fault is not
39  * handled by mmiotrace:
40  */
41 static nokprobe_inline int
42 kmmio_fault(struct pt_regs *regs, unsigned long addr)
43 {
44 	if (unlikely(is_kmmio_active()))
45 		if (kmmio_handler(regs, addr) == 1)
46 			return -1;
47 	return 0;
48 }
49 
50 /*
51  * Prefetch quirks:
52  *
53  * 32-bit mode:
54  *
55  *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
56  *   Check that here and ignore it.
57  *
58  * 64-bit mode:
59  *
60  *   Sometimes the CPU reports invalid exceptions on prefetch.
61  *   Check that here and ignore it.
62  *
63  * Opcode checker based on code by Richard Brunner.
64  */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	/*
	 * Classify one instruction byte.  A non-zero return means the byte
	 * was a prefix and the caller should keep scanning; zero ends the
	 * scan, with *prefetch updated if a prefetch opcode was recognised.
	 */
	const unsigned char hi_nibble = opcode & 0xf0;
	const unsigned char lo_nibble = opcode & 0x0f;

	switch (hi_nibble) {
	case 0x20:
	case 0x30:
		/*
		 * 0x26, 0x2E, 0x36 and 0x3E are valid x86 prefixes.
		 * In X86_64 long mode the CPU signals invalid opcode
		 * when some of these prefixes are present, so X86_64
		 * never gets here anyway.
		 */
		return (lo_nibble & 7) == 0x6;
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * 0x40..0x4F are REX prefixes in AMD64 long mode only,
		 * so the mode the instruction was issued in matters.
		 * The LDT could be checked for lm, but for now it is
		 * good enough to assume that long mode only uses well
		 * known segments or the kernel.
		 */
		return !user_mode(regs) || user_64bit_mode(regs);
#endif
	case 0x60:
		/* 0x64 through 0x67 are valid prefixes in all modes. */
		return (lo_nibble & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2 and 0xF3 are valid prefixes in all modes. */
		return lo_nibble == 0 || (lo_nibble >> 1) == 1;
	case 0x00:
		/* A prefetch instruction is 0x0F0D or 0x0F18. */
		if (!probe_kernel_address(instr, opcode))
			*prefetch = (lo_nibble == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}
111 
112 static int
113 is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
114 {
115 	unsigned char *max_instr;
116 	unsigned char *instr;
117 	int prefetch = 0;
118 
119 	/*
120 	 * If it was a exec (instruction fetch) fault on NX page, then
121 	 * do not ignore the fault:
122 	 */
123 	if (error_code & X86_PF_INSTR)
124 		return 0;
125 
126 	instr = (void *)convert_ip_to_linear(current, regs);
127 	max_instr = instr + 15;
128 
129 	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
130 		return 0;
131 
132 	while (instr < max_instr) {
133 		unsigned char opcode;
134 
135 		if (probe_kernel_address(instr, opcode))
136 			break;
137 
138 		instr++;
139 
140 		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
141 			break;
142 	}
143 	return prefetch;
144 }
145 
/*
 * pgd_list links the pages that back page-global directories (through
 * their ->lru member); pgd_lock is held while that list is walked.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
148 
149 #ifdef CONFIG_X86_32
150 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
151 {
152 	unsigned index = pgd_index(address);
153 	pgd_t *pgd_k;
154 	p4d_t *p4d, *p4d_k;
155 	pud_t *pud, *pud_k;
156 	pmd_t *pmd, *pmd_k;
157 
158 	pgd += index;
159 	pgd_k = init_mm.pgd + index;
160 
161 	if (!pgd_present(*pgd_k))
162 		return NULL;
163 
164 	/*
165 	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
166 	 * and redundant with the set_pmd() on non-PAE. As would
167 	 * set_p4d/set_pud.
168 	 */
169 	p4d = p4d_offset(pgd, address);
170 	p4d_k = p4d_offset(pgd_k, address);
171 	if (!p4d_present(*p4d_k))
172 		return NULL;
173 
174 	pud = pud_offset(p4d, address);
175 	pud_k = pud_offset(p4d_k, address);
176 	if (!pud_present(*pud_k))
177 		return NULL;
178 
179 	pmd = pmd_offset(pud, address);
180 	pmd_k = pmd_offset(pud_k, address);
181 
182 	if (pmd_present(*pmd) != pmd_present(*pmd_k))
183 		set_pmd(pmd, *pmd_k);
184 
185 	if (!pmd_present(*pmd_k))
186 		return NULL;
187 	else
188 		BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
189 
190 	return pmd_k;
191 }
192 
193 static void vmalloc_sync(void)
194 {
195 	unsigned long address;
196 
197 	if (SHARED_KERNEL_PMD)
198 		return;
199 
200 	for (address = VMALLOC_START & PMD_MASK;
201 	     address >= TASK_SIZE_MAX && address < VMALLOC_END;
202 	     address += PMD_SIZE) {
203 		struct page *page;
204 
205 		spin_lock(&pgd_lock);
206 		list_for_each_entry(page, &pgd_list, lru) {
207 			spinlock_t *pgt_lock;
208 
209 			/* the pgt_lock only for Xen */
210 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
211 
212 			spin_lock(pgt_lock);
213 			vmalloc_sync_one(page_address(page), address);
214 			spin_unlock(pgt_lock);
215 		}
216 		spin_unlock(&pgd_lock);
217 	}
218 }
219 
/* 32-bit: new mappings are propagated by the common vmalloc_sync() walk. */
void vmalloc_sync_mappings(void)
{
	vmalloc_sync();
}
224 
/* 32-bit: unmappings go through the same vmalloc_sync() walk as mappings. */
void vmalloc_sync_unmappings(void)
{
	vmalloc_sync();
}
229 
230 /*
231  * 32-bit:
232  *
233  *   Handle a fault on the vmalloc or module mapping area
234  */
235 static noinline int vmalloc_fault(unsigned long address)
236 {
237 	unsigned long pgd_paddr;
238 	pmd_t *pmd_k;
239 	pte_t *pte_k;
240 
241 	/* Make sure we are in vmalloc area: */
242 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
243 		return -1;
244 
245 	/*
246 	 * Synchronize this task's top level page-table
247 	 * with the 'reference' page table.
248 	 *
249 	 * Do _not_ use "current" here. We might be inside
250 	 * an interrupt in the middle of a task switch..
251 	 */
252 	pgd_paddr = read_cr3_pa();
253 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
254 	if (!pmd_k)
255 		return -1;
256 
257 	if (pmd_large(*pmd_k))
258 		return 0;
259 
260 	pte_k = pte_offset_kernel(pmd_k, address);
261 	if (!pte_present(*pte_k))
262 		return -1;
263 
264 	return 0;
265 }
266 NOKPROBE_SYMBOL(vmalloc_fault);
267 
268 /*
269  * Did it hit the DOS screen memory VA from vm86 mode?
270  */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
#ifdef CONFIG_VM86
	unsigned long bit;

	/* Only relevant for a task that faulted while in vm86 mode. */
	if (!v8086_mode(regs) || !tsk->thread.vm86)
		return;

	/*
	 * Page index relative to 0xA0000.  Addresses below 0xA0000 wrap
	 * around to a huge unsigned value and fail the range test below.
	 */
	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32) {
		/*
		 * Use an unsigned long constant: with a plain int, bit == 31
		 * would shift into the sign bit, which is undefined behaviour.
		 */
		tsk->thread.vm86->screen_bitmap |= 1UL << bit;
	}
#endif
}
286 
287 static bool low_pfn(unsigned long pfn)
288 {
289 	return pfn < max_low_pfn;
290 }
291 
292 static void dump_pagetable(unsigned long address)
293 {
294 	pgd_t *base = __va(read_cr3_pa());
295 	pgd_t *pgd = &base[pgd_index(address)];
296 	p4d_t *p4d;
297 	pud_t *pud;
298 	pmd_t *pmd;
299 	pte_t *pte;
300 
301 #ifdef CONFIG_X86_PAE
302 	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
303 	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
304 		goto out;
305 #define pr_pde pr_cont
306 #else
307 #define pr_pde pr_info
308 #endif
309 	p4d = p4d_offset(pgd, address);
310 	pud = pud_offset(p4d, address);
311 	pmd = pmd_offset(pud, address);
312 	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
313 #undef pr_pde
314 
315 	/*
316 	 * We must not directly access the pte in the highpte
317 	 * case if the page table is located in highmem.
318 	 * And let's rather not kmap-atomic the pte, just in case
319 	 * it's allocated already:
320 	 */
321 	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
322 		goto out;
323 
324 	pte = pte_offset_kernel(pmd, address);
325 	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
326 out:
327 	pr_cont("\n");
328 }
329 
330 #else /* CONFIG_X86_64: */
331 
332 void vmalloc_sync_mappings(void)
333 {
334 	/*
335 	 * 64-bit mappings might allocate new p4d/pud pages
336 	 * that need to be propagated to all tasks' PGDs.
337 	 */
338 	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
339 }
340 
/*
 * 64-bit: unmappings never allocate or free p4d/pud pages, so there is
 * nothing to synchronise.
 */
void vmalloc_sync_unmappings(void)
{
	/* Intentionally empty. */
}
348 
349 /*
350  * 64-bit:
351  *
352  *   Handle a fault on the vmalloc area
353  */
354 static noinline int vmalloc_fault(unsigned long address)
355 {
356 	pgd_t *pgd, *pgd_k;
357 	p4d_t *p4d, *p4d_k;
358 	pud_t *pud;
359 	pmd_t *pmd;
360 	pte_t *pte;
361 
362 	/* Make sure we are in vmalloc area: */
363 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
364 		return -1;
365 
366 	/*
367 	 * Copy kernel mappings over when needed. This can also
368 	 * happen within a race in page table update. In the later
369 	 * case just flush:
370 	 */
371 	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
372 	pgd_k = pgd_offset_k(address);
373 	if (pgd_none(*pgd_k))
374 		return -1;
375 
376 	if (pgtable_l5_enabled()) {
377 		if (pgd_none(*pgd)) {
378 			set_pgd(pgd, *pgd_k);
379 			arch_flush_lazy_mmu_mode();
380 		} else {
381 			BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
382 		}
383 	}
384 
385 	/* With 4-level paging, copying happens on the p4d level. */
386 	p4d = p4d_offset(pgd, address);
387 	p4d_k = p4d_offset(pgd_k, address);
388 	if (p4d_none(*p4d_k))
389 		return -1;
390 
391 	if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
392 		set_p4d(p4d, *p4d_k);
393 		arch_flush_lazy_mmu_mode();
394 	} else {
395 		BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
396 	}
397 
398 	BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
399 
400 	pud = pud_offset(p4d, address);
401 	if (pud_none(*pud))
402 		return -1;
403 
404 	if (pud_large(*pud))
405 		return 0;
406 
407 	pmd = pmd_offset(pud, address);
408 	if (pmd_none(*pmd))
409 		return -1;
410 
411 	if (pmd_large(*pmd))
412 		return 0;
413 
414 	pte = pte_offset_kernel(pmd, address);
415 	if (!pte_present(*pte))
416 		return -1;
417 
418 	return 0;
419 }
420 NOKPROBE_SYMBOL(vmalloc_fault);
421 
422 #ifdef CONFIG_CPU_SUP_AMD
423 static const char errata93_warning[] =
424 KERN_ERR
425 "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
426 "******* Working around it, but it may cause SEGVs or burn power.\n"
427 "******* Please consider a BIOS update.\n"
428 "******* Disabling USB legacy in the BIOS may also help.\n";
429 #endif
430 
431 /*
432  * No vm86 mode in 64-bit mode:
433  */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
	/* Deliberately empty: this build has no vm86 state to update. */
}
439 
/*
 * Probe whether one unsigned long can be read from @p.  The result of
 * probe_kernel_address() is passed straight through; the value read is
 * discarded.
 */
static int bad_address(void *p)
{
	unsigned long scratch;

	return probe_kernel_address((unsigned long *)p, scratch);
}
446 
447 static void dump_pagetable(unsigned long address)
448 {
449 	pgd_t *base = __va(read_cr3_pa());
450 	pgd_t *pgd = base + pgd_index(address);
451 	p4d_t *p4d;
452 	pud_t *pud;
453 	pmd_t *pmd;
454 	pte_t *pte;
455 
456 	if (bad_address(pgd))
457 		goto bad;
458 
459 	pr_info("PGD %lx ", pgd_val(*pgd));
460 
461 	if (!pgd_present(*pgd))
462 		goto out;
463 
464 	p4d = p4d_offset(pgd, address);
465 	if (bad_address(p4d))
466 		goto bad;
467 
468 	pr_cont("P4D %lx ", p4d_val(*p4d));
469 	if (!p4d_present(*p4d) || p4d_large(*p4d))
470 		goto out;
471 
472 	pud = pud_offset(p4d, address);
473 	if (bad_address(pud))
474 		goto bad;
475 
476 	pr_cont("PUD %lx ", pud_val(*pud));
477 	if (!pud_present(*pud) || pud_large(*pud))
478 		goto out;
479 
480 	pmd = pmd_offset(pud, address);
481 	if (bad_address(pmd))
482 		goto bad;
483 
484 	pr_cont("PMD %lx ", pmd_val(*pmd));
485 	if (!pmd_present(*pmd) || pmd_large(*pmd))
486 		goto out;
487 
488 	pte = pte_offset_kernel(pmd, address);
489 	if (bad_address(pte))
490 		goto bad;
491 
492 	pr_cont("PTE %lx", pte_val(*pte));
493 out:
494 	pr_cont("\n");
495 	return;
496 bad:
497 	pr_info("BAD\n");
498 }
499 
500 #endif /* CONFIG_X86_64 */
501 
502 /*
503  * Workaround for K8 erratum #93 & buggy BIOS.
504  *
505  * BIOS SMM functions are required to use a specific workaround
506  * to avoid corruption of the 64bit RIP register on C stepping K8.
507  *
508  * A lot of BIOS that didn't get tested properly miss this.
509  *
510  * The OS sees this as a page fault with the upper 32bits of RIP cleared.
511  * Try to work around it here.
512  *
513  * Note we only handle faults in kernel here.
514  * Does nothing on 32-bit.
515  */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	unsigned long fixed_ip;

	/* Only AMD family 0xf CPUs are considered. */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
	    boot_cpu_data.x86 != 0xf)
		return 0;

	/* The fault must be on the instruction pointer, with bits 63:32 clear. */
	if (address != regs->ip || (address >> 32) != 0)
		return 0;

	/* Put the upper 32 bits back and see if that lands in kernel code. */
	fixed_ip = address | (0xffffffffUL << 32);
	if (!(fixed_ip >= (u64)_stext && fixed_ip <= (u64)_etext) &&
	    !(fixed_ip >= MODULES_VADDR && fixed_ip <= MODULES_END))
		return 0;

	printk_once(errata93_warning);
	regs->ip = fixed_ip;
	return 1;
#else
	return 0;
#endif
}
539 
540 /*
 * Work around K8 erratum #100: in compat mode, K8 occasionally jumps
 * to illegal addresses >4GB.
543  *
544  * We catch this in the page fault handler because these addresses
545  * are not reachable. Just detect this case and return.  Any code
546  * segment in LDT is compatibility mode.
547  */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	/* __USER32_CS, or any selector with bit 2 set. */
	const bool compat_cs = regs->cs == __USER32_CS ||
			       (regs->cs & (1<<2));

	/* Such code faulting on an address with bits above 31 set. */
	if (compat_cs && (address >> 32))
		return 1;
#endif
	return 0;
}
556 
/*
 * Pentium F0 0F C7 C8 bug workaround: on affected CPUs, a fault whose
 * address is at index 6 of the IDT (8 bytes per index) is handed to
 * do_invalid_op().
 *
 * Returns 1 when the fault was handled that way, 0 otherwise.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_has_bug(X86_BUG_F00F) &&
	    ((address - idt_descr.address) >> 3) == 6) {
		do_invalid_op(regs, 0);
		return 1;
	}
#endif
	return 0;
}
576 
577 static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
578 {
579 	u32 offset = (index >> 3) * sizeof(struct desc_struct);
580 	unsigned long addr;
581 	struct ldttss_desc desc;
582 
583 	if (index == 0) {
584 		pr_alert("%s: NULL\n", name);
585 		return;
586 	}
587 
588 	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
589 		pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
590 		return;
591 	}
592 
593 	if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
594 			      sizeof(struct ldttss_desc))) {
595 		pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
596 			 name, index);
597 		return;
598 	}
599 
600 	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
601 #ifdef CONFIG_X86_64
602 	addr |= ((u64)desc.base3 << 32);
603 #endif
604 	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
605 		 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
606 }
607 
608 static void
609 show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
610 {
611 	if (!oops_may_print())
612 		return;
613 
614 	if (error_code & X86_PF_INSTR) {
615 		unsigned int level;
616 		pgd_t *pgd;
617 		pte_t *pte;
618 
619 		pgd = __va(read_cr3_pa());
620 		pgd += pgd_index(address);
621 
622 		pte = lookup_address_in_pgd(pgd, address, &level);
623 
624 		if (pte && pte_present(*pte) && !pte_exec(*pte))
625 			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
626 				from_kuid(&init_user_ns, current_uid()));
627 		if (pte && pte_present(*pte) && pte_exec(*pte) &&
628 				(pgd_flags(*pgd) & _PAGE_USER) &&
629 				(__read_cr4() & X86_CR4_SMEP))
630 			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
631 				from_kuid(&init_user_ns, current_uid()));
632 	}
633 
634 	if (address < PAGE_SIZE && !user_mode(regs))
635 		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
636 			(void *)address);
637 	else
638 		pr_alert("BUG: unable to handle page fault for address: %px\n",
639 			(void *)address);
640 
641 	pr_alert("#PF: %s %s in %s mode\n",
642 		 (error_code & X86_PF_USER)  ? "user" : "supervisor",
643 		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
644 		 (error_code & X86_PF_WRITE) ? "write access" :
645 					       "read access",
646 			     user_mode(regs) ? "user" : "kernel");
647 	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
648 		 !(error_code & X86_PF_PROT) ? "not-present page" :
649 		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
650 		 (error_code & X86_PF_PK)    ? "protection keys violation" :
651 					       "permissions violation");
652 
653 	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
654 		struct desc_ptr idt, gdt;
655 		u16 ldtr, tr;
656 
657 		/*
658 		 * This can happen for quite a few reasons.  The more obvious
659 		 * ones are faults accessing the GDT, or LDT.  Perhaps
660 		 * surprisingly, if the CPU tries to deliver a benign or
661 		 * contributory exception from user code and gets a page fault
662 		 * during delivery, the page fault can be delivered as though
663 		 * it originated directly from user code.  This could happen
664 		 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
665 		 * kernel or IST stack.
666 		 */
667 		store_idt(&idt);
668 
669 		/* Usable even on Xen PV -- it's just slow. */
670 		native_store_gdt(&gdt);
671 
672 		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
673 			 idt.address, idt.size, gdt.address, gdt.size);
674 
675 		store_ldt(ldtr);
676 		show_ldttss(&gdt, "LDTR", ldtr);
677 
678 		store_tr(tr);
679 		show_ldttss(&gdt, "TR", tr);
680 	}
681 
682 	dump_pagetable(address);
683 }
684 
685 static noinline void
686 pgtable_bad(struct pt_regs *regs, unsigned long error_code,
687 	    unsigned long address)
688 {
689 	struct task_struct *tsk;
690 	unsigned long flags;
691 	int sig;
692 
693 	flags = oops_begin();
694 	tsk = current;
695 	sig = SIGKILL;
696 
697 	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
698 	       tsk->comm, address);
699 	dump_pagetable(address);
700 
701 	if (__die("Bad pagetable", regs, error_code))
702 		sig = 0;
703 
704 	oops_end(flags, regs, sig);
705 }
706 
707 static void set_signal_archinfo(unsigned long address,
708 				unsigned long error_code)
709 {
710 	struct task_struct *tsk = current;
711 
712 	/*
713 	 * To avoid leaking information about the kernel page
714 	 * table layout, pretend that user-mode accesses to
715 	 * kernel addresses are always protection faults.
716 	 *
717 	 * NB: This means that failed vsyscalls with vsyscall=none
718 	 * will have the PROT bit.  This doesn't leak any
719 	 * information and does not appear to cause any problems.
720 	 */
721 	if (address >= TASK_SIZE_MAX)
722 		error_code |= X86_PF_PROT;
723 
724 	tsk->thread.trap_nr = X86_TRAP_PF;
725 	tsk->thread.error_code = error_code | X86_PF_USER;
726 	tsk->thread.cr2 = address;
727 }
728 
729 static noinline void
730 no_context(struct pt_regs *regs, unsigned long error_code,
731 	   unsigned long address, int signal, int si_code)
732 {
733 	struct task_struct *tsk = current;
734 	unsigned long flags;
735 	int sig;
736 
737 	if (user_mode(regs)) {
738 		/*
739 		 * This is an implicit supervisor-mode access from user
740 		 * mode.  Bypass all the kernel-mode recovery code and just
741 		 * OOPS.
742 		 */
743 		goto oops;
744 	}
745 
746 	/* Are we prepared to handle this kernel fault? */
747 	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
748 		/*
749 		 * Any interrupt that takes a fault gets the fixup. This makes
750 		 * the below recursive fault logic only apply to a faults from
751 		 * task context.
752 		 */
753 		if (in_interrupt())
754 			return;
755 
756 		/*
757 		 * Per the above we're !in_interrupt(), aka. task context.
758 		 *
759 		 * In this case we need to make sure we're not recursively
760 		 * faulting through the emulate_vsyscall() logic.
761 		 */
762 		if (current->thread.sig_on_uaccess_err && signal) {
763 			set_signal_archinfo(address, error_code);
764 
765 			/* XXX: hwpoison faults will set the wrong code. */
766 			force_sig_fault(signal, si_code, (void __user *)address);
767 		}
768 
769 		/*
770 		 * Barring that, we can do the fixup and be happy.
771 		 */
772 		return;
773 	}
774 
775 #ifdef CONFIG_VMAP_STACK
776 	/*
777 	 * Stack overflow?  During boot, we can fault near the initial
778 	 * stack in the direct map, but that's not an overflow -- check
779 	 * that we're in vmalloc space to avoid this.
780 	 */
781 	if (is_vmalloc_addr((void *)address) &&
782 	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
783 	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
784 		unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
785 		/*
786 		 * We're likely to be running with very little stack space
787 		 * left.  It's plausible that we'd hit this condition but
788 		 * double-fault even before we get this far, in which case
789 		 * we're fine: the double-fault handler will deal with it.
790 		 *
791 		 * We don't want to make it all the way into the oops code
792 		 * and then double-fault, though, because we're likely to
793 		 * break the console driver and lose most of the stack dump.
794 		 */
795 		asm volatile ("movq %[stack], %%rsp\n\t"
796 			      "call handle_stack_overflow\n\t"
797 			      "1: jmp 1b"
798 			      : ASM_CALL_CONSTRAINT
799 			      : "D" ("kernel stack overflow (page fault)"),
800 				"S" (regs), "d" (address),
801 				[stack] "rm" (stack));
802 		unreachable();
803 	}
804 #endif
805 
806 	/*
807 	 * 32-bit:
808 	 *
809 	 *   Valid to do another page fault here, because if this fault
810 	 *   had been triggered by is_prefetch fixup_exception would have
811 	 *   handled it.
812 	 *
813 	 * 64-bit:
814 	 *
815 	 *   Hall of shame of CPU/BIOS bugs.
816 	 */
817 	if (is_prefetch(regs, error_code, address))
818 		return;
819 
820 	if (is_errata93(regs, address))
821 		return;
822 
823 	/*
824 	 * Buggy firmware could access regions which might page fault, try to
825 	 * recover from such faults.
826 	 */
827 	if (IS_ENABLED(CONFIG_EFI))
828 		efi_recover_from_page_fault(address);
829 
830 oops:
831 	/*
832 	 * Oops. The kernel tried to access some bad page. We'll have to
833 	 * terminate things with extreme prejudice:
834 	 */
835 	flags = oops_begin();
836 
837 	show_fault_oops(regs, error_code, address);
838 
839 	if (task_stack_end_corrupted(tsk))
840 		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
841 
842 	sig = SIGKILL;
843 	if (__die("Oops", regs, error_code))
844 		sig = 0;
845 
846 	/* Executive summary in case the body of the oops scrolled away */
847 	printk(KERN_DEFAULT "CR2: %016lx\n", address);
848 
849 	oops_end(flags, regs, sig);
850 }
851 
852 /*
853  * Print out info about fatal segfaults, if the show_unhandled_signals
854  * sysctl is set:
855  */
856 static inline void
857 show_signal_msg(struct pt_regs *regs, unsigned long error_code,
858 		unsigned long address, struct task_struct *tsk)
859 {
860 	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
861 
862 	if (!unhandled_signal(tsk, SIGSEGV))
863 		return;
864 
865 	if (!printk_ratelimit())
866 		return;
867 
868 	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
869 		loglvl, tsk->comm, task_pid_nr(tsk), address,
870 		(void *)regs->ip, (void *)regs->sp, error_code);
871 
872 	print_vma_addr(KERN_CONT " in ", regs->ip);
873 
874 	printk(KERN_CONT "\n");
875 
876 	show_opcodes(regs, loglvl);
877 }
878 
879 /*
 * The (legacy) vsyscall page is the lone page in the kernel portion
 * of the address space that has user-accessible permissions.
882  */
883 static bool is_vsyscall_vaddr(unsigned long vaddr)
884 {
885 	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
886 }
887 
888 static void
889 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
890 		       unsigned long address, u32 pkey, int si_code)
891 {
892 	struct task_struct *tsk = current;
893 
894 	/* User mode accesses just cause a SIGSEGV */
895 	if (user_mode(regs) && (error_code & X86_PF_USER)) {
896 		/*
897 		 * It's possible to have interrupts off here:
898 		 */
899 		local_irq_enable();
900 
901 		/*
902 		 * Valid to do another page fault here because this one came
903 		 * from user space:
904 		 */
905 		if (is_prefetch(regs, error_code, address))
906 			return;
907 
908 		if (is_errata100(regs, address))
909 			return;
910 
911 		/*
912 		 * To avoid leaking information about the kernel page table
913 		 * layout, pretend that user-mode accesses to kernel addresses
914 		 * are always protection faults.
915 		 */
916 		if (address >= TASK_SIZE_MAX)
917 			error_code |= X86_PF_PROT;
918 
919 		if (likely(show_unhandled_signals))
920 			show_signal_msg(regs, error_code, address, tsk);
921 
922 		set_signal_archinfo(address, error_code);
923 
924 		if (si_code == SEGV_PKUERR)
925 			force_sig_pkuerr((void __user *)address, pkey);
926 
927 		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
928 
929 		return;
930 	}
931 
932 	if (is_f00f_bug(regs, address))
933 		return;
934 
935 	no_context(regs, error_code, address, SIGSEGV, si_code);
936 }
937 
938 static noinline void
939 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
940 		     unsigned long address)
941 {
942 	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
943 }
944 
945 static void
946 __bad_area(struct pt_regs *regs, unsigned long error_code,
947 	   unsigned long address, u32 pkey, int si_code)
948 {
949 	struct mm_struct *mm = current->mm;
950 	/*
951 	 * Something tried to access memory that isn't in our memory map..
952 	 * Fix it, but check if it's kernel or user first..
953 	 */
954 	up_read(&mm->mmap_sem);
955 
956 	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
957 }
958 
959 static noinline void
960 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
961 {
962 	__bad_area(regs, error_code, address, 0, SEGV_MAPERR);
963 }
964 
965 static inline bool bad_area_access_from_pkeys(unsigned long error_code,
966 		struct vm_area_struct *vma)
967 {
968 	/* This code is always called on the current mm */
969 	bool foreign = false;
970 
971 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
972 		return false;
973 	if (error_code & X86_PF_PK)
974 		return true;
975 	/* this checks permission keys on the VMA: */
976 	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
977 				       (error_code & X86_PF_INSTR), foreign))
978 		return true;
979 	return false;
980 }
981 
982 static noinline void
983 bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
984 		      unsigned long address, struct vm_area_struct *vma)
985 {
986 	/*
987 	 * This OSPKE check is not strictly necessary at runtime.
988 	 * But, doing it this way allows compiler optimizations
989 	 * if pkeys are compiled out.
990 	 */
991 	if (bad_area_access_from_pkeys(error_code, vma)) {
992 		/*
993 		 * A protection key fault means that the PKRU value did not allow
994 		 * access to some PTE.  Userspace can figure out what PKRU was
995 		 * from the XSAVE state.  This function captures the pkey from
996 		 * the vma and passes it to userspace so userspace can discover
997 		 * which protection key was set on the PTE.
998 		 *
999 		 * If we get here, we know that the hardware signaled a X86_PF_PK
1000 		 * fault and that there was a VMA once we got in the fault
1001 		 * handler.  It does *not* guarantee that the VMA we find here
1002 		 * was the one that we faulted on.
1003 		 *
1004 		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
1005 		 * 2. T1   : set PKRU to deny access to pkey=4, touches page
1006 		 * 3. T1   : faults...
1007 		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
1008 		 * 5. T1   : enters fault handler, takes mmap_sem, etc...
1009 		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
1010 		 *	     faulted on a pte with its pkey=4.
1011 		 */
1012 		u32 pkey = vma_pkey(vma);
1013 
1014 		__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
1015 	} else {
1016 		__bad_area(regs, error_code, address, 0, SEGV_ACCERR);
1017 	}
1018 }
1019 
1020 static void
1021 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
1022 	  vm_fault_t fault)
1023 {
1024 	/* Kernel mode? Handle exceptions or die: */
1025 	if (!(error_code & X86_PF_USER)) {
1026 		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
1027 		return;
1028 	}
1029 
1030 	/* User-space => ok to do another page fault: */
1031 	if (is_prefetch(regs, error_code, address))
1032 		return;
1033 
1034 	set_signal_archinfo(address, error_code);
1035 
1036 #ifdef CONFIG_MEMORY_FAILURE
1037 	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
1038 		struct task_struct *tsk = current;
1039 		unsigned lsb = 0;
1040 
1041 		pr_err(
1042 	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
1043 			tsk->comm, tsk->pid, address);
1044 		if (fault & VM_FAULT_HWPOISON_LARGE)
1045 			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
1046 		if (fault & VM_FAULT_HWPOISON)
1047 			lsb = PAGE_SHIFT;
1048 		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
1049 		return;
1050 	}
1051 #endif
1052 	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
1053 }
1054 
1055 static noinline void
1056 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
1057 	       unsigned long address, vm_fault_t fault)
1058 {
1059 	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
1060 		no_context(regs, error_code, address, 0, 0);
1061 		return;
1062 	}
1063 
1064 	if (fault & VM_FAULT_OOM) {
1065 		/* Kernel mode? Handle exceptions or die: */
1066 		if (!(error_code & X86_PF_USER)) {
1067 			no_context(regs, error_code, address,
1068 				   SIGSEGV, SEGV_MAPERR);
1069 			return;
1070 		}
1071 
1072 		/*
1073 		 * We ran out of memory, call the OOM killer, and return the
1074 		 * userspace (which will retry the fault, or kill us if we got
1075 		 * oom-killed):
1076 		 */
1077 		pagefault_out_of_memory();
1078 	} else {
1079 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
1080 			     VM_FAULT_HWPOISON_LARGE))
1081 			do_sigbus(regs, error_code, address, fault);
1082 		else if (fault & VM_FAULT_SIGSEGV)
1083 			bad_area_nosemaphore(regs, error_code, address);
1084 		else
1085 			BUG();
1086 	}
1087 }
1088 
1089 static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
1090 {
1091 	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
1092 		return 0;
1093 
1094 	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
1095 		return 0;
1096 
1097 	return 1;
1098 }
1099 
1100 /*
1101  * Handle a spurious fault caused by a stale TLB entry.
1102  *
1103  * This allows us to lazily refresh the TLB when increasing the
1104  * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
1105  * eagerly is very expensive since that implies doing a full
1106  * cross-processor TLB flush, even if no stale TLB entries exist
1107  * on other processors.
1108  *
1109  * Spurious faults may only occur if the TLB contains an entry with
1110  * fewer permission than the page table entry.  Non-present (P = 0)
1111  * and reserved bit (R = 1) faults are never spurious.
1112  *
1113  * There are no security implications to leaving a stale TLB when
1114  * increasing the permissions on a page.
1115  *
1116  * Returns non-zero if a spurious fault was handled, zero otherwise.
1117  *
1118  * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
1119  * (Optional Invalidation).
1120  */
1121 static noinline int
1122 spurious_kernel_fault(unsigned long error_code, unsigned long address)
1123 {
1124 	pgd_t *pgd;
1125 	p4d_t *p4d;
1126 	pud_t *pud;
1127 	pmd_t *pmd;
1128 	pte_t *pte;
1129 	int ret;
1130 
1131 	/*
1132 	 * Only writes to RO or instruction fetches from NX may cause
1133 	 * spurious faults.
1134 	 *
1135 	 * These could be from user or supervisor accesses but the TLB
1136 	 * is only lazily flushed after a kernel mapping protection
1137 	 * change, so user accesses are not expected to cause spurious
1138 	 * faults.
1139 	 */
1140 	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
1141 	    error_code != (X86_PF_INSTR | X86_PF_PROT))
1142 		return 0;
1143 
1144 	pgd = init_mm.pgd + pgd_index(address);
1145 	if (!pgd_present(*pgd))
1146 		return 0;
1147 
1148 	p4d = p4d_offset(pgd, address);
1149 	if (!p4d_present(*p4d))
1150 		return 0;
1151 
1152 	if (p4d_large(*p4d))
1153 		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
1154 
1155 	pud = pud_offset(p4d, address);
1156 	if (!pud_present(*pud))
1157 		return 0;
1158 
1159 	if (pud_large(*pud))
1160 		return spurious_kernel_fault_check(error_code, (pte_t *) pud);
1161 
1162 	pmd = pmd_offset(pud, address);
1163 	if (!pmd_present(*pmd))
1164 		return 0;
1165 
1166 	if (pmd_large(*pmd))
1167 		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
1168 
1169 	pte = pte_offset_kernel(pmd, address);
1170 	if (!pte_present(*pte))
1171 		return 0;
1172 
1173 	ret = spurious_kernel_fault_check(error_code, pte);
1174 	if (!ret)
1175 		return 0;
1176 
1177 	/*
1178 	 * Make sure we have permissions in PMD.
1179 	 * If not, then there's a bug in the page tables:
1180 	 */
1181 	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
1182 	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
1183 
1184 	return ret;
1185 }
1186 NOKPROBE_SYMBOL(spurious_kernel_fault);
1187 
/*
 * Global flag with external linkage, set to 1 by default.
 * NOTE(review): nothing in this part of the file reads it; presumably it
 * switches the logging of signals that no handler catches - confirm against
 * its users.
 */
int show_unhandled_signals = 1;
1189 
1190 static inline int
1191 access_error(unsigned long error_code, struct vm_area_struct *vma)
1192 {
1193 	/* This is only called for the current mm, so: */
1194 	bool foreign = false;
1195 
1196 	/*
1197 	 * Read or write was blocked by protection keys.  This is
1198 	 * always an unconditional error and can never result in
1199 	 * a follow-up action to resolve the fault, like a COW.
1200 	 */
1201 	if (error_code & X86_PF_PK)
1202 		return 1;
1203 
1204 	/*
1205 	 * Make sure to check the VMA so that we do not perform
1206 	 * faults just to hit a X86_PF_PK as soon as we fill in a
1207 	 * page.
1208 	 */
1209 	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
1210 				       (error_code & X86_PF_INSTR), foreign))
1211 		return 1;
1212 
1213 	if (error_code & X86_PF_WRITE) {
1214 		/* write, present and write, not present: */
1215 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
1216 			return 1;
1217 		return 0;
1218 	}
1219 
1220 	/* read, present: */
1221 	if (unlikely(error_code & X86_PF_PROT))
1222 		return 1;
1223 
1224 	/* read, not present: */
1225 	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
1226 		return 1;
1227 
1228 	return 0;
1229 }
1230 
1231 static int fault_in_kernel_space(unsigned long address)
1232 {
1233 	/*
1234 	 * On 64-bit systems, the vsyscall page is at an address above
1235 	 * TASK_SIZE_MAX, but is not considered part of the kernel
1236 	 * address space.
1237 	 */
1238 	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
1239 		return false;
1240 
1241 	return address >= TASK_SIZE_MAX;
1242 }
1243 
1244 /*
1245  * Called for all faults where 'address' is part of the kernel address
1246  * space.  Might get called for faults that originate from *code* that
1247  * ran in userspace or the kernel.
1248  */
1249 static void
1250 do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
1251 		   unsigned long address)
1252 {
1253 	/*
1254 	 * Protection keys exceptions only happen on user pages.  We
1255 	 * have no user pages in the kernel portion of the address
1256 	 * space, so do not expect them here.
1257 	 */
1258 	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
1259 
1260 	/*
1261 	 * We can fault-in kernel-space virtual memory on-demand. The
1262 	 * 'reference' page table is init_mm.pgd.
1263 	 *
1264 	 * NOTE! We MUST NOT take any locks for this case. We may
1265 	 * be in an interrupt or a critical region, and should
1266 	 * only copy the information from the master page table,
1267 	 * nothing more.
1268 	 *
1269 	 * Before doing this on-demand faulting, ensure that the
1270 	 * fault is not any of the following:
1271 	 * 1. A fault on a PTE with a reserved bit set.
1272 	 * 2. A fault caused by a user-mode access.  (Do not demand-
1273 	 *    fault kernel memory due to user-mode accesses).
1274 	 * 3. A fault caused by a page-level protection violation.
1275 	 *    (A demand fault would be on a non-present page which
1276 	 *     would have X86_PF_PROT==0).
1277 	 */
1278 	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
1279 		if (vmalloc_fault(address) >= 0)
1280 			return;
1281 	}
1282 
1283 	/* Was the fault spurious, caused by lazy TLB invalidation? */
1284 	if (spurious_kernel_fault(hw_error_code, address))
1285 		return;
1286 
1287 	/* kprobes don't want to hook the spurious faults: */
1288 	if (kprobe_page_fault(regs, X86_TRAP_PF))
1289 		return;
1290 
1291 	/*
1292 	 * Note, despite being a "bad area", there are quite a few
1293 	 * acceptable reasons to get here, such as erratum fixups
1294 	 * and handling kernel code that can fault, like get_user().
1295 	 *
1296 	 * Don't take the mm semaphore here. If we fixup a prefetch
1297 	 * fault we could otherwise deadlock:
1298 	 */
1299 	bad_area_nosemaphore(regs, hw_error_code, address);
1300 }
1301 NOKPROBE_SYMBOL(do_kern_addr_fault);
1302 
/*
 * Handle faults in the user portion of the address space.
 *
 * @regs:		register state captured at the fault
 * @hw_error_code:	page-fault error code (X86_PF_* bits)
 * @address:		faulting address
 *
 * Outline:
 *  1. Reject faults that must not touch the mm at all (kprobe-handled, SMAP
 *     violation, pagefaults disabled or no mm).
 *  2. Enable interrupts where allowed and translate the error code into
 *     FAULT_FLAG_* bits.
 *  3. Take mmap_sem for reading, look up the VMA (growing a VM_GROWSDOWN
 *     one if needed) and check the access against it.
 *  4. Call handle_mm_fault(), retrying at most once on VM_FAULT_RETRY.
 *  5. Report errors through mm_fault_error() or account the fault as
 *     major/minor.
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
			unsigned long hw_error_code,
			unsigned long address)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct mm_struct *mm;
	/* 'major' accumulates VM_FAULT_MAJOR over the first try and the retry */
	vm_fault_t fault, major = 0;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	tsk = current;
	mm = tsk->mm;

	/* kprobes don't want to hook the spurious faults: */
	if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * Reserved bits are never expected to be set on
	 * entries in the user portion of the page tables.
	 *
	 * NOTE(review): there is no return after pgtable_bad(); if it ever
	 * returns, execution continues with the checks below.
	 */
	if (unlikely(hw_error_code & X86_PF_RSVD))
		pgtable_bad(regs, hw_error_code, address);

	/*
	 * If SMAP is on, check for invalid kernel (supervisor) access to user
	 * pages in the user address space.  The odd case here is WRUSS,
	 * which, according to the preliminary documentation, does not respect
	 * SMAP and will have the USER bit set so, in all cases, SMAP
	 * enforcement appears to be consistent with the USER bit.
	 */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
		     !(hw_error_code & X86_PF_USER) &&
		     !(regs->flags & X86_EFLAGS_AC)))
	{
		bad_area_nosemaphore(regs, hw_error_code, address);
		return;
	}

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in a region with pagefaults disabled then we must not take the fault
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		bad_area_nosemaphore(regs, hw_error_code, address);
		return;
	}

	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode(regs)) {
		local_irq_enable();
		flags |= FAULT_FLAG_USER;
	} else {
		/* Kernel mode: only re-enable irqs if they were on at the fault */
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	/* Translate the hardware error code into generic fault flags */
	if (hw_error_code & X86_PF_WRITE)
		flags |= FAULT_FLAG_WRITE;
	if (hw_error_code & X86_PF_INSTR)
		flags |= FAULT_FLAG_INSTRUCTION;

#ifdef CONFIG_X86_64
	/*
	 * Faults in the vsyscall page might need emulation.  The
	 * vsyscall page is at a high address (>PAGE_OFFSET), but is
	 * considered to be part of the user address space.
	 *
	 * The vsyscall page does not have a "real" VMA, so do this
	 * emulation before we go searching for VMAs.
	 *
	 * PKRU never rejects instruction fetches, so we don't need
	 * to consider the PF_PK bit.
	 */
	if (is_vsyscall_vaddr(address)) {
		if (emulate_vsyscall(hw_error_code, regs, address))
			return;
	}
#endif

	/*
	 * Kernel-mode access to the user address space should only occur
	 * on well-defined single instructions listed in the exception
	 * tables.  But, an erroneous kernel fault occurring outside one of
	 * those areas which also holds mmap_sem might deadlock attempting
	 * to validate the fault against the address space.
	 *
	 * Only do the expensive exception table search when we might be at
	 * risk of a deadlock.  This happens if we
	 * 1. Failed to acquire mmap_sem, and
	 * 2. The access did not originate in userspace.
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
			/*
			 * Fault from code in kernel from
			 * which we do not expect faults.
			 */
			bad_area_nosemaphore(regs, hw_error_code, address);
			return;
		}
		/*
		 * 'retry' is also the target of the goto in the VM_FAULT_RETRY
		 * handling below: mmap_sem has been dropped by then, so it is
		 * re-taken here with a blocking down_read().
		 */
retry:
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The above down_read_trylock() might have succeeded in
		 * which case we'll have missed the might_sleep() from
		 * down_read():
		 */
		might_sleep();
	}

	/*
	 * mmap_sem is held for reading from here on.
	 * NOTE(review): the bad_area*() helpers called below (unlike the
	 * _nosemaphore variant used above) are expected to release it -
	 * confirm against their definitions.
	 */
	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, hw_error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	/*
	 * The VMA found starts above the address: this is only acceptable
	 * for a VM_GROWSDOWN VMA that expand_stack() manages to extend.
	 */
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, hw_error_code, address);
		return;
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, hw_error_code, address);
		return;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
good_area:
	if (unlikely(access_error(hw_error_code, vma))) {
		bad_area_access_error(regs, hw_error_code, address, vma);
		return;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
	 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
	 *
	 * Note that handle_userfault() may also release and reacquire mmap_sem
	 * (and not return with VM_FAULT_RETRY), when returning to userland to
	 * repeat the page fault later with a VM_FAULT_NOPAGE retval
	 * (potentially after handling any pending signal during the return to
	 * userland). The return to userland is identified whenever
	 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
	 */
	fault = handle_mm_fault(vma, address, flags);
	major |= fault & VM_FAULT_MAJOR;

	/*
	 * If we need to retry the mmap_sem has already been released,
	 * and if there is a fatal signal pending there is no guarantee
	 * that we made any progress. Handle this case first.
	 */
	if (unlikely(fault & VM_FAULT_RETRY)) {
		/*
		 * Retry at most once: FAULT_FLAG_ALLOW_RETRY is cleared on
		 * the first pass, so a second VM_FAULT_RETRY skips the goto.
		 */
		if (flags & FAULT_FLAG_ALLOW_RETRY) {
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;
			if (!fatal_signal_pending(tsk))
				goto retry;
		}

		/* User mode? Just return to handle the fatal exception */
		if (flags & FAULT_FLAG_USER)
			return;

		/* Not returning to user mode? Handle exceptions or die: */
		no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
		return;
	}

	/* mmap_sem is dropped before errors are reported and faults counted */
	up_read(&mm->mmap_sem);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, hw_error_code, address, fault);
		return;
	}

	/*
	 * Major/minor page fault accounting. If any of the events
	 * returned VM_FAULT_MAJOR, we account it as a major fault.
	 */
	if (major) {
		tsk->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
	} else {
		tsk->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
	}

	check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);
1511 
1512 static __always_inline void
1513 trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
1514 			 unsigned long address)
1515 {
1516 	if (!trace_pagefault_enabled())
1517 		return;
1518 
1519 	if (user_mode(regs))
1520 		trace_page_fault_user(address, regs, error_code);
1521 	else
1522 		trace_page_fault_kernel(address, regs, error_code);
1523 }
1524 
1525 dotraplinkage void
1526 do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
1527 		unsigned long address)
1528 {
1529 	prefetchw(&current->mm->mmap_sem);
1530 	trace_page_fault_entries(regs, hw_error_code, address);
1531 
1532 	if (unlikely(kmmio_fault(regs, address)))
1533 		return;
1534 
1535 	/* Was the fault on kernel-controlled part of the address space? */
1536 	if (unlikely(fault_in_kernel_space(address)))
1537 		do_kern_addr_fault(regs, hw_error_code, address);
1538 	else
1539 		do_user_addr_fault(regs, hw_error_code, address);
1540 }
1541 NOKPROBE_SYMBOL(do_page_fault);
1542