1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. 5 * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar 6 */ 7 #include <linux/sched.h> /* test_thread_flag(), ... */ 8 #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ 9 #include <linux/kdebug.h> /* oops_begin/end, ... */ 10 #include <linux/extable.h> /* search_exception_tables */ 11 #include <linux/memblock.h> /* max_low_pfn */ 12 #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ 13 #include <linux/mmiotrace.h> /* kmmio_handler, ... */ 14 #include <linux/perf_event.h> /* perf_sw_event */ 15 #include <linux/hugetlb.h> /* hstate_index_to_shift */ 16 #include <linux/prefetch.h> /* prefetchw */ 17 #include <linux/context_tracking.h> /* exception_enter(), ... */ 18 #include <linux/uaccess.h> /* faulthandler_disabled() */ 19 #include <linux/efi.h> /* efi_recover_from_page_fault()*/ 20 #include <linux/mm_types.h> 21 22 #include <asm/cpufeature.h> /* boot_cpu_has, ... */ 23 #include <asm/traps.h> /* dotraplinkage, ... */ 24 #include <asm/pgalloc.h> /* pgd_*(), ... */ 25 #include <asm/fixmap.h> /* VSYSCALL_ADDR */ 26 #include <asm/vsyscall.h> /* emulate_vsyscall */ 27 #include <asm/vm86.h> /* struct vm86 */ 28 #include <asm/mmu_context.h> /* vma_pkey() */ 29 #include <asm/efi.h> /* efi_recover_from_page_fault()*/ 30 #include <asm/desc.h> /* store_idt(), ... */ 31 #include <asm/cpu_entry_area.h> /* exception stack */ 32 #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ 33 34 #define CREATE_TRACE_POINTS 35 #include <asm/trace/exceptions.h> 36 37 /* 38 * Returns 0 if mmiotrace is disabled, or if the fault is not 39 * handled by mmiotrace: 40 */ 41 static nokprobe_inline int 42 kmmio_fault(struct pt_regs *regs, unsigned long addr) 43 { 44 if (unlikely(is_kmmio_active())) 45 if (kmmio_handler(regs, addr) == 1) 46 return -1; 47 return 0; 48 } 49 50 /* 51 * Prefetch quirks: 52 * 53 * 32-bit mode: 54 * 55 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. 56 * Check that here and ignore it. 57 * 58 * 64-bit mode: 59 * 60 * Sometimes the CPU reports invalid exceptions on prefetch. 61 * Check that here and ignore it. 62 * 63 * Opcode checker based on code by Richard Brunner. 64 */ 65 static inline int 66 check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, 67 unsigned char opcode, int *prefetch) 68 { 69 unsigned char instr_hi = opcode & 0xf0; 70 unsigned char instr_lo = opcode & 0x0f; 71 72 switch (instr_hi) { 73 case 0x20: 74 case 0x30: 75 /* 76 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. 77 * In X86_64 long mode, the CPU will signal invalid 78 * opcode if some of these prefixes are present so 79 * X86_64 will never get here anyway 80 */ 81 return ((instr_lo & 7) == 0x6); 82 #ifdef CONFIG_X86_64 83 case 0x40: 84 /* 85 * In AMD64 long mode 0x40..0x4F are valid REX prefixes 86 * Need to figure out under what instruction mode the 87 * instruction was issued. Could check the LDT for lm, 88 * but for now it's good enough to assume that long 89 * mode only uses well known segments or kernel. 90 */ 91 return (!user_mode(regs) || user_64bit_mode(regs)); 92 #endif 93 case 0x60: 94 /* 0x64 thru 0x67 are valid prefixes in all modes. */ 95 return (instr_lo & 0xC) == 0x4; 96 case 0xF0: 97 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ 98 return !instr_lo || (instr_lo>>1) == 1; 99 case 0x00: 100 /* Prefetch instruction is 0x0F0D or 0x0F18 */ 101 if (probe_kernel_address(instr, opcode)) 102 return 0; 103 104 *prefetch = (instr_lo == 0xF) && 105 (opcode == 0x0D || opcode == 0x18); 106 return 0; 107 default: 108 return 0; 109 } 110 } 111 112 static int 113 is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) 114 { 115 unsigned char *max_instr; 116 unsigned char *instr; 117 int prefetch = 0; 118 119 /* 120 * If it was a exec (instruction fetch) fault on NX page, then 121 * do not ignore the fault: 122 */ 123 if (error_code & X86_PF_INSTR) 124 return 0; 125 126 instr = (void *)convert_ip_to_linear(current, regs); 127 max_instr = instr + 15; 128 129 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) 130 return 0; 131 132 while (instr < max_instr) { 133 unsigned char opcode; 134 135 if (probe_kernel_address(instr, opcode)) 136 break; 137 138 instr++; 139 140 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) 141 break; 142 } 143 return prefetch; 144 } 145 146 DEFINE_SPINLOCK(pgd_lock); 147 LIST_HEAD(pgd_list); 148 149 #ifdef CONFIG_X86_32 150 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) 151 { 152 unsigned index = pgd_index(address); 153 pgd_t *pgd_k; 154 p4d_t *p4d, *p4d_k; 155 pud_t *pud, *pud_k; 156 pmd_t *pmd, *pmd_k; 157 158 pgd += index; 159 pgd_k = init_mm.pgd + index; 160 161 if (!pgd_present(*pgd_k)) 162 return NULL; 163 164 /* 165 * set_pgd(pgd, *pgd_k); here would be useless on PAE 166 * and redundant with the set_pmd() on non-PAE. As would 167 * set_p4d/set_pud. 168 */ 169 p4d = p4d_offset(pgd, address); 170 p4d_k = p4d_offset(pgd_k, address); 171 if (!p4d_present(*p4d_k)) 172 return NULL; 173 174 pud = pud_offset(p4d, address); 175 pud_k = pud_offset(p4d_k, address); 176 if (!pud_present(*pud_k)) 177 return NULL; 178 179 pmd = pmd_offset(pud, address); 180 pmd_k = pmd_offset(pud_k, address); 181 182 if (pmd_present(*pmd) != pmd_present(*pmd_k)) 183 set_pmd(pmd, *pmd_k); 184 185 if (!pmd_present(*pmd_k)) 186 return NULL; 187 else 188 BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); 189 190 return pmd_k; 191 } 192 193 static void vmalloc_sync(void) 194 { 195 unsigned long address; 196 197 if (SHARED_KERNEL_PMD) 198 return; 199 200 for (address = VMALLOC_START & PMD_MASK; 201 address >= TASK_SIZE_MAX && address < VMALLOC_END; 202 address += PMD_SIZE) { 203 struct page *page; 204 205 spin_lock(&pgd_lock); 206 list_for_each_entry(page, &pgd_list, lru) { 207 spinlock_t *pgt_lock; 208 209 /* the pgt_lock only for Xen */ 210 pgt_lock = &pgd_page_get_mm(page)->page_table_lock; 211 212 spin_lock(pgt_lock); 213 vmalloc_sync_one(page_address(page), address); 214 spin_unlock(pgt_lock); 215 } 216 spin_unlock(&pgd_lock); 217 } 218 } 219 220 void vmalloc_sync_mappings(void) 221 { 222 vmalloc_sync(); 223 } 224 225 void vmalloc_sync_unmappings(void) 226 { 227 vmalloc_sync(); 228 } 229 230 /* 231 * 32-bit: 232 * 233 * Handle a fault on the vmalloc or module mapping area 234 */ 235 static noinline int vmalloc_fault(unsigned long address) 236 { 237 unsigned long pgd_paddr; 238 pmd_t *pmd_k; 239 pte_t *pte_k; 240 241 /* Make sure we are in vmalloc area: */ 242 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 243 return -1; 244 245 /* 246 * Synchronize this task's top level page-table 247 * with the 'reference' page table. 248 * 249 * Do _not_ use "current" here. We might be inside 250 * an interrupt in the middle of a task switch.. 251 */ 252 pgd_paddr = read_cr3_pa(); 253 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 254 if (!pmd_k) 255 return -1; 256 257 if (pmd_large(*pmd_k)) 258 return 0; 259 260 pte_k = pte_offset_kernel(pmd_k, address); 261 if (!pte_present(*pte_k)) 262 return -1; 263 264 return 0; 265 } 266 NOKPROBE_SYMBOL(vmalloc_fault); 267 268 /* 269 * Did it hit the DOS screen memory VA from vm86 mode? 270 */ 271 static inline void 272 check_v8086_mode(struct pt_regs *regs, unsigned long address, 273 struct task_struct *tsk) 274 { 275 #ifdef CONFIG_VM86 276 unsigned long bit; 277 278 if (!v8086_mode(regs) || !tsk->thread.vm86) 279 return; 280 281 bit = (address - 0xA0000) >> PAGE_SHIFT; 282 if (bit < 32) 283 tsk->thread.vm86->screen_bitmap |= 1 << bit; 284 #endif 285 } 286 287 static bool low_pfn(unsigned long pfn) 288 { 289 return pfn < max_low_pfn; 290 } 291 292 static void dump_pagetable(unsigned long address) 293 { 294 pgd_t *base = __va(read_cr3_pa()); 295 pgd_t *pgd = &base[pgd_index(address)]; 296 p4d_t *p4d; 297 pud_t *pud; 298 pmd_t *pmd; 299 pte_t *pte; 300 301 #ifdef CONFIG_X86_PAE 302 pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); 303 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) 304 goto out; 305 #define pr_pde pr_cont 306 #else 307 #define pr_pde pr_info 308 #endif 309 p4d = p4d_offset(pgd, address); 310 pud = pud_offset(p4d, address); 311 pmd = pmd_offset(pud, address); 312 pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); 313 #undef pr_pde 314 315 /* 316 * We must not directly access the pte in the highpte 317 * case if the page table is located in highmem. 318 * And let's rather not kmap-atomic the pte, just in case 319 * it's allocated already: 320 */ 321 if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) 322 goto out; 323 324 pte = pte_offset_kernel(pmd, address); 325 pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); 326 out: 327 pr_cont("\n"); 328 } 329 330 #else /* CONFIG_X86_64: */ 331 332 void vmalloc_sync_mappings(void) 333 { 334 /* 335 * 64-bit mappings might allocate new p4d/pud pages 336 * that need to be propagated to all tasks' PGDs. 337 */ 338 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); 339 } 340 341 void vmalloc_sync_unmappings(void) 342 { 343 /* 344 * Unmappings never allocate or free p4d/pud pages. 345 * No work is required here. 346 */ 347 } 348 349 /* 350 * 64-bit: 351 * 352 * Handle a fault on the vmalloc area 353 */ 354 static noinline int vmalloc_fault(unsigned long address) 355 { 356 pgd_t *pgd, *pgd_k; 357 p4d_t *p4d, *p4d_k; 358 pud_t *pud; 359 pmd_t *pmd; 360 pte_t *pte; 361 362 /* Make sure we are in vmalloc area: */ 363 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 364 return -1; 365 366 /* 367 * Copy kernel mappings over when needed. This can also 368 * happen within a race in page table update. In the later 369 * case just flush: 370 */ 371 pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); 372 pgd_k = pgd_offset_k(address); 373 if (pgd_none(*pgd_k)) 374 return -1; 375 376 if (pgtable_l5_enabled()) { 377 if (pgd_none(*pgd)) { 378 set_pgd(pgd, *pgd_k); 379 arch_flush_lazy_mmu_mode(); 380 } else { 381 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); 382 } 383 } 384 385 /* With 4-level paging, copying happens on the p4d level. */ 386 p4d = p4d_offset(pgd, address); 387 p4d_k = p4d_offset(pgd_k, address); 388 if (p4d_none(*p4d_k)) 389 return -1; 390 391 if (p4d_none(*p4d) && !pgtable_l5_enabled()) { 392 set_p4d(p4d, *p4d_k); 393 arch_flush_lazy_mmu_mode(); 394 } else { 395 BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); 396 } 397 398 BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); 399 400 pud = pud_offset(p4d, address); 401 if (pud_none(*pud)) 402 return -1; 403 404 if (pud_large(*pud)) 405 return 0; 406 407 pmd = pmd_offset(pud, address); 408 if (pmd_none(*pmd)) 409 return -1; 410 411 if (pmd_large(*pmd)) 412 return 0; 413 414 pte = pte_offset_kernel(pmd, address); 415 if (!pte_present(*pte)) 416 return -1; 417 418 return 0; 419 } 420 NOKPROBE_SYMBOL(vmalloc_fault); 421 422 #ifdef CONFIG_CPU_SUP_AMD 423 static const char errata93_warning[] = 424 KERN_ERR 425 "******* Your BIOS seems to not contain a fix for K8 errata #93\n" 426 "******* Working around it, but it may cause SEGVs or burn power.\n" 427 "******* Please consider a BIOS update.\n" 428 "******* Disabling USB legacy in the BIOS may also help.\n"; 429 #endif 430 431 /* 432 * No vm86 mode in 64-bit mode: 433 */ 434 static inline void 435 check_v8086_mode(struct pt_regs *regs, unsigned long address, 436 struct task_struct *tsk) 437 { 438 } 439 440 static int bad_address(void *p) 441 { 442 unsigned long dummy; 443 444 return probe_kernel_address((unsigned long *)p, dummy); 445 } 446 447 static void dump_pagetable(unsigned long address) 448 { 449 pgd_t *base = __va(read_cr3_pa()); 450 pgd_t *pgd = base + pgd_index(address); 451 p4d_t *p4d; 452 pud_t *pud; 453 pmd_t *pmd; 454 pte_t *pte; 455 456 if (bad_address(pgd)) 457 goto bad; 458 459 pr_info("PGD %lx ", pgd_val(*pgd)); 460 461 if (!pgd_present(*pgd)) 462 goto out; 463 464 p4d = p4d_offset(pgd, address); 465 if (bad_address(p4d)) 466 goto bad; 467 468 pr_cont("P4D %lx ", p4d_val(*p4d)); 469 if (!p4d_present(*p4d) || p4d_large(*p4d)) 470 goto out; 471 472 pud = pud_offset(p4d, address); 473 if (bad_address(pud)) 474 goto bad; 475 476 pr_cont("PUD %lx ", pud_val(*pud)); 477 if (!pud_present(*pud) || pud_large(*pud)) 478 goto out; 479 480 pmd = pmd_offset(pud, address); 481 if (bad_address(pmd)) 482 goto bad; 483 484 pr_cont("PMD %lx ", pmd_val(*pmd)); 485 if (!pmd_present(*pmd) || pmd_large(*pmd)) 486 goto out; 487 488 pte = pte_offset_kernel(pmd, address); 489 if (bad_address(pte)) 490 goto bad; 491 492 pr_cont("PTE %lx", pte_val(*pte)); 493 out: 494 pr_cont("\n"); 495 return; 496 bad: 497 pr_info("BAD\n"); 498 } 499 500 #endif /* CONFIG_X86_64 */ 501 502 /* 503 * Workaround for K8 erratum #93 & buggy BIOS. 504 * 505 * BIOS SMM functions are required to use a specific workaround 506 * to avoid corruption of the 64bit RIP register on C stepping K8. 507 * 508 * A lot of BIOS that didn't get tested properly miss this. 509 * 510 * The OS sees this as a page fault with the upper 32bits of RIP cleared. 511 * Try to work around it here. 512 * 513 * Note we only handle faults in kernel here. 514 * Does nothing on 32-bit. 515 */ 516 static int is_errata93(struct pt_regs *regs, unsigned long address) 517 { 518 #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) 519 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD 520 || boot_cpu_data.x86 != 0xf) 521 return 0; 522 523 if (address != regs->ip) 524 return 0; 525 526 if ((address >> 32) != 0) 527 return 0; 528 529 address |= 0xffffffffUL << 32; 530 if ((address >= (u64)_stext && address <= (u64)_etext) || 531 (address >= MODULES_VADDR && address <= MODULES_END)) { 532 printk_once(errata93_warning); 533 regs->ip = address; 534 return 1; 535 } 536 #endif 537 return 0; 538 } 539 540 /* 541 * Work around K8 erratum #100 K8 in compat mode occasionally jumps 542 * to illegal addresses >4GB. 543 * 544 * We catch this in the page fault handler because these addresses 545 * are not reachable. Just detect this case and return. Any code 546 * segment in LDT is compatibility mode. 547 */ 548 static int is_errata100(struct pt_regs *regs, unsigned long address) 549 { 550 #ifdef CONFIG_X86_64 551 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) 552 return 1; 553 #endif 554 return 0; 555 } 556 557 static int is_f00f_bug(struct pt_regs *regs, unsigned long address) 558 { 559 #ifdef CONFIG_X86_F00F_BUG 560 unsigned long nr; 561 562 /* 563 * Pentium F0 0F C7 C8 bug workaround: 564 */ 565 if (boot_cpu_has_bug(X86_BUG_F00F)) { 566 nr = (address - idt_descr.address) >> 3; 567 568 if (nr == 6) { 569 do_invalid_op(regs, 0); 570 return 1; 571 } 572 } 573 #endif 574 return 0; 575 } 576 577 static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) 578 { 579 u32 offset = (index >> 3) * sizeof(struct desc_struct); 580 unsigned long addr; 581 struct ldttss_desc desc; 582 583 if (index == 0) { 584 pr_alert("%s: NULL\n", name); 585 return; 586 } 587 588 if (offset + sizeof(struct ldttss_desc) >= gdt->size) { 589 pr_alert("%s: 0x%hx -- out of bounds\n", name, index); 590 return; 591 } 592 593 if (probe_kernel_read(&desc, (void *)(gdt->address + offset), 594 sizeof(struct ldttss_desc))) { 595 pr_alert("%s: 0x%hx -- GDT entry is not readable\n", 596 name, index); 597 return; 598 } 599 600 addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24); 601 #ifdef CONFIG_X86_64 602 addr |= ((u64)desc.base3 << 32); 603 #endif 604 pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", 605 name, index, addr, (desc.limit0 | (desc.limit1 << 16))); 606 } 607 608 static void 609 show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) 610 { 611 if (!oops_may_print()) 612 return; 613 614 if (error_code & X86_PF_INSTR) { 615 unsigned int level; 616 pgd_t *pgd; 617 pte_t *pte; 618 619 pgd = __va(read_cr3_pa()); 620 pgd += pgd_index(address); 621 622 pte = lookup_address_in_pgd(pgd, address, &level); 623 624 if (pte && pte_present(*pte) && !pte_exec(*pte)) 625 pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", 626 from_kuid(&init_user_ns, current_uid())); 627 if (pte && pte_present(*pte) && pte_exec(*pte) && 628 (pgd_flags(*pgd) & _PAGE_USER) && 629 (__read_cr4() & X86_CR4_SMEP)) 630 pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", 631 from_kuid(&init_user_ns, current_uid())); 632 } 633 634 if (address < PAGE_SIZE && !user_mode(regs)) 635 pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", 636 (void *)address); 637 else 638 pr_alert("BUG: unable to handle page fault for address: %px\n", 639 (void *)address); 640 641 pr_alert("#PF: %s %s in %s mode\n", 642 (error_code & X86_PF_USER) ? "user" : "supervisor", 643 (error_code & X86_PF_INSTR) ? "instruction fetch" : 644 (error_code & X86_PF_WRITE) ? "write access" : 645 "read access", 646 user_mode(regs) ? "user" : "kernel"); 647 pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, 648 !(error_code & X86_PF_PROT) ? "not-present page" : 649 (error_code & X86_PF_RSVD) ? "reserved bit violation" : 650 (error_code & X86_PF_PK) ? "protection keys violation" : 651 "permissions violation"); 652 653 if (!(error_code & X86_PF_USER) && user_mode(regs)) { 654 struct desc_ptr idt, gdt; 655 u16 ldtr, tr; 656 657 /* 658 * This can happen for quite a few reasons. The more obvious 659 * ones are faults accessing the GDT, or LDT. Perhaps 660 * surprisingly, if the CPU tries to deliver a benign or 661 * contributory exception from user code and gets a page fault 662 * during delivery, the page fault can be delivered as though 663 * it originated directly from user code. This could happen 664 * due to wrong permissions on the IDT, GDT, LDT, TSS, or 665 * kernel or IST stack. 666 */ 667 store_idt(&idt); 668 669 /* Usable even on Xen PV -- it's just slow. */ 670 native_store_gdt(&gdt); 671 672 pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", 673 idt.address, idt.size, gdt.address, gdt.size); 674 675 store_ldt(ldtr); 676 show_ldttss(&gdt, "LDTR", ldtr); 677 678 store_tr(tr); 679 show_ldttss(&gdt, "TR", tr); 680 } 681 682 dump_pagetable(address); 683 } 684 685 static noinline void 686 pgtable_bad(struct pt_regs *regs, unsigned long error_code, 687 unsigned long address) 688 { 689 struct task_struct *tsk; 690 unsigned long flags; 691 int sig; 692 693 flags = oops_begin(); 694 tsk = current; 695 sig = SIGKILL; 696 697 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 698 tsk->comm, address); 699 dump_pagetable(address); 700 701 if (__die("Bad pagetable", regs, error_code)) 702 sig = 0; 703 704 oops_end(flags, regs, sig); 705 } 706 707 static void set_signal_archinfo(unsigned long address, 708 unsigned long error_code) 709 { 710 struct task_struct *tsk = current; 711 712 /* 713 * To avoid leaking information about the kernel page 714 * table layout, pretend that user-mode accesses to 715 * kernel addresses are always protection faults. 716 * 717 * NB: This means that failed vsyscalls with vsyscall=none 718 * will have the PROT bit. This doesn't leak any 719 * information and does not appear to cause any problems. 720 */ 721 if (address >= TASK_SIZE_MAX) 722 error_code |= X86_PF_PROT; 723 724 tsk->thread.trap_nr = X86_TRAP_PF; 725 tsk->thread.error_code = error_code | X86_PF_USER; 726 tsk->thread.cr2 = address; 727 } 728 729 static noinline void 730 no_context(struct pt_regs *regs, unsigned long error_code, 731 unsigned long address, int signal, int si_code) 732 { 733 struct task_struct *tsk = current; 734 unsigned long flags; 735 int sig; 736 737 if (user_mode(regs)) { 738 /* 739 * This is an implicit supervisor-mode access from user 740 * mode. Bypass all the kernel-mode recovery code and just 741 * OOPS. 742 */ 743 goto oops; 744 } 745 746 /* Are we prepared to handle this kernel fault? */ 747 if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { 748 /* 749 * Any interrupt that takes a fault gets the fixup. This makes 750 * the below recursive fault logic only apply to a faults from 751 * task context. 752 */ 753 if (in_interrupt()) 754 return; 755 756 /* 757 * Per the above we're !in_interrupt(), aka. task context. 758 * 759 * In this case we need to make sure we're not recursively 760 * faulting through the emulate_vsyscall() logic. 761 */ 762 if (current->thread.sig_on_uaccess_err && signal) { 763 set_signal_archinfo(address, error_code); 764 765 /* XXX: hwpoison faults will set the wrong code. */ 766 force_sig_fault(signal, si_code, (void __user *)address); 767 } 768 769 /* 770 * Barring that, we can do the fixup and be happy. 771 */ 772 return; 773 } 774 775 #ifdef CONFIG_VMAP_STACK 776 /* 777 * Stack overflow? During boot, we can fault near the initial 778 * stack in the direct map, but that's not an overflow -- check 779 * that we're in vmalloc space to avoid this. 780 */ 781 if (is_vmalloc_addr((void *)address) && 782 (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || 783 address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { 784 unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); 785 /* 786 * We're likely to be running with very little stack space 787 * left. It's plausible that we'd hit this condition but 788 * double-fault even before we get this far, in which case 789 * we're fine: the double-fault handler will deal with it. 790 * 791 * We don't want to make it all the way into the oops code 792 * and then double-fault, though, because we're likely to 793 * break the console driver and lose most of the stack dump. 794 */ 795 asm volatile ("movq %[stack], %%rsp\n\t" 796 "call handle_stack_overflow\n\t" 797 "1: jmp 1b" 798 : ASM_CALL_CONSTRAINT 799 : "D" ("kernel stack overflow (page fault)"), 800 "S" (regs), "d" (address), 801 [stack] "rm" (stack)); 802 unreachable(); 803 } 804 #endif 805 806 /* 807 * 32-bit: 808 * 809 * Valid to do another page fault here, because if this fault 810 * had been triggered by is_prefetch fixup_exception would have 811 * handled it. 812 * 813 * 64-bit: 814 * 815 * Hall of shame of CPU/BIOS bugs. 816 */ 817 if (is_prefetch(regs, error_code, address)) 818 return; 819 820 if (is_errata93(regs, address)) 821 return; 822 823 /* 824 * Buggy firmware could access regions which might page fault, try to 825 * recover from such faults. 826 */ 827 if (IS_ENABLED(CONFIG_EFI)) 828 efi_recover_from_page_fault(address); 829 830 oops: 831 /* 832 * Oops. The kernel tried to access some bad page. We'll have to 833 * terminate things with extreme prejudice: 834 */ 835 flags = oops_begin(); 836 837 show_fault_oops(regs, error_code, address); 838 839 if (task_stack_end_corrupted(tsk)) 840 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 841 842 sig = SIGKILL; 843 if (__die("Oops", regs, error_code)) 844 sig = 0; 845 846 /* Executive summary in case the body of the oops scrolled away */ 847 printk(KERN_DEFAULT "CR2: %016lx\n", address); 848 849 oops_end(flags, regs, sig); 850 } 851 852 /* 853 * Print out info about fatal segfaults, if the show_unhandled_signals 854 * sysctl is set: 855 */ 856 static inline void 857 show_signal_msg(struct pt_regs *regs, unsigned long error_code, 858 unsigned long address, struct task_struct *tsk) 859 { 860 const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; 861 862 if (!unhandled_signal(tsk, SIGSEGV)) 863 return; 864 865 if (!printk_ratelimit()) 866 return; 867 868 printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", 869 loglvl, tsk->comm, task_pid_nr(tsk), address, 870 (void *)regs->ip, (void *)regs->sp, error_code); 871 872 print_vma_addr(KERN_CONT " in ", regs->ip); 873 874 printk(KERN_CONT "\n"); 875 876 show_opcodes(regs, loglvl); 877 } 878 879 /* 880 * The (legacy) vsyscall page is the long page in the kernel portion 881 * of the address space that has user-accessible permissions. 882 */ 883 static bool is_vsyscall_vaddr(unsigned long vaddr) 884 { 885 return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); 886 } 887 888 static void 889 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 890 unsigned long address, u32 pkey, int si_code) 891 { 892 struct task_struct *tsk = current; 893 894 /* User mode accesses just cause a SIGSEGV */ 895 if (user_mode(regs) && (error_code & X86_PF_USER)) { 896 /* 897 * It's possible to have interrupts off here: 898 */ 899 local_irq_enable(); 900 901 /* 902 * Valid to do another page fault here because this one came 903 * from user space: 904 */ 905 if (is_prefetch(regs, error_code, address)) 906 return; 907 908 if (is_errata100(regs, address)) 909 return; 910 911 /* 912 * To avoid leaking information about the kernel page table 913 * layout, pretend that user-mode accesses to kernel addresses 914 * are always protection faults. 915 */ 916 if (address >= TASK_SIZE_MAX) 917 error_code |= X86_PF_PROT; 918 919 if (likely(show_unhandled_signals)) 920 show_signal_msg(regs, error_code, address, tsk); 921 922 set_signal_archinfo(address, error_code); 923 924 if (si_code == SEGV_PKUERR) 925 force_sig_pkuerr((void __user *)address, pkey); 926 927 force_sig_fault(SIGSEGV, si_code, (void __user *)address); 928 929 return; 930 } 931 932 if (is_f00f_bug(regs, address)) 933 return; 934 935 no_context(regs, error_code, address, SIGSEGV, si_code); 936 } 937 938 static noinline void 939 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, 940 unsigned long address) 941 { 942 __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); 943 } 944 945 static void 946 __bad_area(struct pt_regs *regs, unsigned long error_code, 947 unsigned long address, u32 pkey, int si_code) 948 { 949 struct mm_struct *mm = current->mm; 950 /* 951 * Something tried to access memory that isn't in our memory map.. 952 * Fix it, but check if it's kernel or user first.. 953 */ 954 up_read(&mm->mmap_sem); 955 956 __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); 957 } 958 959 static noinline void 960 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) 961 { 962 __bad_area(regs, error_code, address, 0, SEGV_MAPERR); 963 } 964 965 static inline bool bad_area_access_from_pkeys(unsigned long error_code, 966 struct vm_area_struct *vma) 967 { 968 /* This code is always called on the current mm */ 969 bool foreign = false; 970 971 if (!boot_cpu_has(X86_FEATURE_OSPKE)) 972 return false; 973 if (error_code & X86_PF_PK) 974 return true; 975 /* this checks permission keys on the VMA: */ 976 if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), 977 (error_code & X86_PF_INSTR), foreign)) 978 return true; 979 return false; 980 } 981 982 static noinline void 983 bad_area_access_error(struct pt_regs *regs, unsigned long error_code, 984 unsigned long address, struct vm_area_struct *vma) 985 { 986 /* 987 * This OSPKE check is not strictly necessary at runtime. 988 * But, doing it this way allows compiler optimizations 989 * if pkeys are compiled out. 990 */ 991 if (bad_area_access_from_pkeys(error_code, vma)) { 992 /* 993 * A protection key fault means that the PKRU value did not allow 994 * access to some PTE. Userspace can figure out what PKRU was 995 * from the XSAVE state. This function captures the pkey from 996 * the vma and passes it to userspace so userspace can discover 997 * which protection key was set on the PTE. 998 * 999 * If we get here, we know that the hardware signaled a X86_PF_PK 1000 * fault and that there was a VMA once we got in the fault 1001 * handler. It does *not* guarantee that the VMA we find here 1002 * was the one that we faulted on. 1003 * 1004 * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); 1005 * 2. T1 : set PKRU to deny access to pkey=4, touches page 1006 * 3. T1 : faults... 1007 * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); 1008 * 5. T1 : enters fault handler, takes mmap_sem, etc... 1009 * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really 1010 * faulted on a pte with its pkey=4. 1011 */ 1012 u32 pkey = vma_pkey(vma); 1013 1014 __bad_area(regs, error_code, address, pkey, SEGV_PKUERR); 1015 } else { 1016 __bad_area(regs, error_code, address, 0, SEGV_ACCERR); 1017 } 1018 } 1019 1020 static void 1021 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, 1022 vm_fault_t fault) 1023 { 1024 /* Kernel mode? Handle exceptions or die: */ 1025 if (!(error_code & X86_PF_USER)) { 1026 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); 1027 return; 1028 } 1029 1030 /* User-space => ok to do another page fault: */ 1031 if (is_prefetch(regs, error_code, address)) 1032 return; 1033 1034 set_signal_archinfo(address, error_code); 1035 1036 #ifdef CONFIG_MEMORY_FAILURE 1037 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { 1038 struct task_struct *tsk = current; 1039 unsigned lsb = 0; 1040 1041 pr_err( 1042 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 1043 tsk->comm, tsk->pid, address); 1044 if (fault & VM_FAULT_HWPOISON_LARGE) 1045 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); 1046 if (fault & VM_FAULT_HWPOISON) 1047 lsb = PAGE_SHIFT; 1048 force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); 1049 return; 1050 } 1051 #endif 1052 force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); 1053 } 1054 1055 static noinline void 1056 mm_fault_error(struct pt_regs *regs, unsigned long error_code, 1057 unsigned long address, vm_fault_t fault) 1058 { 1059 if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { 1060 no_context(regs, error_code, address, 0, 0); 1061 return; 1062 } 1063 1064 if (fault & VM_FAULT_OOM) { 1065 /* Kernel mode? Handle exceptions or die: */ 1066 if (!(error_code & X86_PF_USER)) { 1067 no_context(regs, error_code, address, 1068 SIGSEGV, SEGV_MAPERR); 1069 return; 1070 } 1071 1072 /* 1073 * We ran out of memory, call the OOM killer, and return the 1074 * userspace (which will retry the fault, or kill us if we got 1075 * oom-killed): 1076 */ 1077 pagefault_out_of_memory(); 1078 } else { 1079 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| 1080 VM_FAULT_HWPOISON_LARGE)) 1081 do_sigbus(regs, error_code, address, fault); 1082 else if (fault & VM_FAULT_SIGSEGV) 1083 bad_area_nosemaphore(regs, error_code, address); 1084 else 1085 BUG(); 1086 } 1087 } 1088 1089 static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) 1090 { 1091 if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) 1092 return 0; 1093 1094 if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) 1095 return 0; 1096 1097 return 1; 1098 } 1099 1100 /* 1101 * Handle a spurious fault caused by a stale TLB entry. 1102 * 1103 * This allows us to lazily refresh the TLB when increasing the 1104 * permissions of a kernel page (RO -> RW or NX -> X). Doing it 1105 * eagerly is very expensive since that implies doing a full 1106 * cross-processor TLB flush, even if no stale TLB entries exist 1107 * on other processors. 1108 * 1109 * Spurious faults may only occur if the TLB contains an entry with 1110 * fewer permission than the page table entry. Non-present (P = 0) 1111 * and reserved bit (R = 1) faults are never spurious. 1112 * 1113 * There are no security implications to leaving a stale TLB when 1114 * increasing the permissions on a page. 1115 * 1116 * Returns non-zero if a spurious fault was handled, zero otherwise. 1117 * 1118 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 1119 * (Optional Invalidation). 1120 */ 1121 static noinline int 1122 spurious_kernel_fault(unsigned long error_code, unsigned long address) 1123 { 1124 pgd_t *pgd; 1125 p4d_t *p4d; 1126 pud_t *pud; 1127 pmd_t *pmd; 1128 pte_t *pte; 1129 int ret; 1130 1131 /* 1132 * Only writes to RO or instruction fetches from NX may cause 1133 * spurious faults. 1134 * 1135 * These could be from user or supervisor accesses but the TLB 1136 * is only lazily flushed after a kernel mapping protection 1137 * change, so user accesses are not expected to cause spurious 1138 * faults. 1139 */ 1140 if (error_code != (X86_PF_WRITE | X86_PF_PROT) && 1141 error_code != (X86_PF_INSTR | X86_PF_PROT)) 1142 return 0; 1143 1144 pgd = init_mm.pgd + pgd_index(address); 1145 if (!pgd_present(*pgd)) 1146 return 0; 1147 1148 p4d = p4d_offset(pgd, address); 1149 if (!p4d_present(*p4d)) 1150 return 0; 1151 1152 if (p4d_large(*p4d)) 1153 return spurious_kernel_fault_check(error_code, (pte_t *) p4d); 1154 1155 pud = pud_offset(p4d, address); 1156 if (!pud_present(*pud)) 1157 return 0; 1158 1159 if (pud_large(*pud)) 1160 return spurious_kernel_fault_check(error_code, (pte_t *) pud); 1161 1162 pmd = pmd_offset(pud, address); 1163 if (!pmd_present(*pmd)) 1164 return 0; 1165 1166 if (pmd_large(*pmd)) 1167 return spurious_kernel_fault_check(error_code, (pte_t *) pmd); 1168 1169 pte = pte_offset_kernel(pmd, address); 1170 if (!pte_present(*pte)) 1171 return 0; 1172 1173 ret = spurious_kernel_fault_check(error_code, pte); 1174 if (!ret) 1175 return 0; 1176 1177 /* 1178 * Make sure we have permissions in PMD. 1179 * If not, then there's a bug in the page tables: 1180 */ 1181 ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); 1182 WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); 1183 1184 return ret; 1185 } 1186 NOKPROBE_SYMBOL(spurious_kernel_fault); 1187 1188 int show_unhandled_signals = 1; 1189 1190 static inline int 1191 access_error(unsigned long error_code, struct vm_area_struct *vma) 1192 { 1193 /* This is only called for the current mm, so: */ 1194 bool foreign = false; 1195 1196 /* 1197 * Read or write was blocked by protection keys. This is 1198 * always an unconditional error and can never result in 1199 * a follow-up action to resolve the fault, like a COW. 1200 */ 1201 if (error_code & X86_PF_PK) 1202 return 1; 1203 1204 /* 1205 * Make sure to check the VMA so that we do not perform 1206 * faults just to hit a X86_PF_PK as soon as we fill in a 1207 * page. 1208 */ 1209 if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), 1210 (error_code & X86_PF_INSTR), foreign)) 1211 return 1; 1212 1213 if (error_code & X86_PF_WRITE) { 1214 /* write, present and write, not present: */ 1215 if (unlikely(!(vma->vm_flags & VM_WRITE))) 1216 return 1; 1217 return 0; 1218 } 1219 1220 /* read, present: */ 1221 if (unlikely(error_code & X86_PF_PROT)) 1222 return 1; 1223 1224 /* read, not present: */ 1225 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) 1226 return 1; 1227 1228 return 0; 1229 } 1230 1231 static int fault_in_kernel_space(unsigned long address) 1232 { 1233 /* 1234 * On 64-bit systems, the vsyscall page is at an address above 1235 * TASK_SIZE_MAX, but is not considered part of the kernel 1236 * address space. 1237 */ 1238 if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) 1239 return false; 1240 1241 return address >= TASK_SIZE_MAX; 1242 } 1243 1244 /* 1245 * Called for all faults where 'address' is part of the kernel address 1246 * space. Might get called for faults that originate from *code* that 1247 * ran in userspace or the kernel. 1248 */ 1249 static void 1250 do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, 1251 unsigned long address) 1252 { 1253 /* 1254 * Protection keys exceptions only happen on user pages. We 1255 * have no user pages in the kernel portion of the address 1256 * space, so do not expect them here. 1257 */ 1258 WARN_ON_ONCE(hw_error_code & X86_PF_PK); 1259 1260 /* 1261 * We can fault-in kernel-space virtual memory on-demand. The 1262 * 'reference' page table is init_mm.pgd. 1263 * 1264 * NOTE! We MUST NOT take any locks for this case. We may 1265 * be in an interrupt or a critical region, and should 1266 * only copy the information from the master page table, 1267 * nothing more. 1268 * 1269 * Before doing this on-demand faulting, ensure that the 1270 * fault is not any of the following: 1271 * 1. A fault on a PTE with a reserved bit set. 1272 * 2. A fault caused by a user-mode access. (Do not demand- 1273 * fault kernel memory due to user-mode accesses). 1274 * 3. A fault caused by a page-level protection violation. 1275 * (A demand fault would be on a non-present page which 1276 * would have X86_PF_PROT==0). 1277 */ 1278 if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { 1279 if (vmalloc_fault(address) >= 0) 1280 return; 1281 } 1282 1283 /* Was the fault spurious, caused by lazy TLB invalidation? */ 1284 if (spurious_kernel_fault(hw_error_code, address)) 1285 return; 1286 1287 /* kprobes don't want to hook the spurious faults: */ 1288 if (kprobe_page_fault(regs, X86_TRAP_PF)) 1289 return; 1290 1291 /* 1292 * Note, despite being a "bad area", there are quite a few 1293 * acceptable reasons to get here, such as erratum fixups 1294 * and handling kernel code that can fault, like get_user(). 1295 * 1296 * Don't take the mm semaphore here. If we fixup a prefetch 1297 * fault we could otherwise deadlock: 1298 */ 1299 bad_area_nosemaphore(regs, hw_error_code, address); 1300 } 1301 NOKPROBE_SYMBOL(do_kern_addr_fault); 1302 1303 /* Handle faults in the user portion of the address space */ 1304 static inline 1305 void do_user_addr_fault(struct pt_regs *regs, 1306 unsigned long hw_error_code, 1307 unsigned long address) 1308 { 1309 struct vm_area_struct *vma; 1310 struct task_struct *tsk; 1311 struct mm_struct *mm; 1312 vm_fault_t fault, major = 0; 1313 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 1314 1315 tsk = current; 1316 mm = tsk->mm; 1317 1318 /* kprobes don't want to hook the spurious faults: */ 1319 if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) 1320 return; 1321 1322 /* 1323 * Reserved bits are never expected to be set on 1324 * entries in the user portion of the page tables. 1325 */ 1326 if (unlikely(hw_error_code & X86_PF_RSVD)) 1327 pgtable_bad(regs, hw_error_code, address); 1328 1329 /* 1330 * If SMAP is on, check for invalid kernel (supervisor) access to user 1331 * pages in the user address space. The odd case here is WRUSS, 1332 * which, according to the preliminary documentation, does not respect 1333 * SMAP and will have the USER bit set so, in all cases, SMAP 1334 * enforcement appears to be consistent with the USER bit. 1335 */ 1336 if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && 1337 !(hw_error_code & X86_PF_USER) && 1338 !(regs->flags & X86_EFLAGS_AC))) 1339 { 1340 bad_area_nosemaphore(regs, hw_error_code, address); 1341 return; 1342 } 1343 1344 /* 1345 * If we're in an interrupt, have no user context or are running 1346 * in a region with pagefaults disabled then we must not take the fault 1347 */ 1348 if (unlikely(faulthandler_disabled() || !mm)) { 1349 bad_area_nosemaphore(regs, hw_error_code, address); 1350 return; 1351 } 1352 1353 /* 1354 * It's safe to allow irq's after cr2 has been saved and the 1355 * vmalloc fault has been handled. 1356 * 1357 * User-mode registers count as a user access even for any 1358 * potential system fault or CPU buglet: 1359 */ 1360 if (user_mode(regs)) { 1361 local_irq_enable(); 1362 flags |= FAULT_FLAG_USER; 1363 } else { 1364 if (regs->flags & X86_EFLAGS_IF) 1365 local_irq_enable(); 1366 } 1367 1368 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); 1369 1370 if (hw_error_code & X86_PF_WRITE) 1371 flags |= FAULT_FLAG_WRITE; 1372 if (hw_error_code & X86_PF_INSTR) 1373 flags |= FAULT_FLAG_INSTRUCTION; 1374 1375 #ifdef CONFIG_X86_64 1376 /* 1377 * Faults in the vsyscall page might need emulation. The 1378 * vsyscall page is at a high address (>PAGE_OFFSET), but is 1379 * considered to be part of the user address space. 1380 * 1381 * The vsyscall page does not have a "real" VMA, so do this 1382 * emulation before we go searching for VMAs. 1383 * 1384 * PKRU never rejects instruction fetches, so we don't need 1385 * to consider the PF_PK bit. 1386 */ 1387 if (is_vsyscall_vaddr(address)) { 1388 if (emulate_vsyscall(hw_error_code, regs, address)) 1389 return; 1390 } 1391 #endif 1392 1393 /* 1394 * Kernel-mode access to the user address space should only occur 1395 * on well-defined single instructions listed in the exception 1396 * tables. But, an erroneous kernel fault occurring outside one of 1397 * those areas which also holds mmap_sem might deadlock attempting 1398 * to validate the fault against the address space. 1399 * 1400 * Only do the expensive exception table search when we might be at 1401 * risk of a deadlock. This happens if we 1402 * 1. Failed to acquire mmap_sem, and 1403 * 2. The access did not originate in userspace. 1404 */ 1405 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 1406 if (!user_mode(regs) && !search_exception_tables(regs->ip)) { 1407 /* 1408 * Fault from code in kernel from 1409 * which we do not expect faults. 1410 */ 1411 bad_area_nosemaphore(regs, hw_error_code, address); 1412 return; 1413 } 1414 retry: 1415 down_read(&mm->mmap_sem); 1416 } else { 1417 /* 1418 * The above down_read_trylock() might have succeeded in 1419 * which case we'll have missed the might_sleep() from 1420 * down_read(): 1421 */ 1422 might_sleep(); 1423 } 1424 1425 vma = find_vma(mm, address); 1426 if (unlikely(!vma)) { 1427 bad_area(regs, hw_error_code, address); 1428 return; 1429 } 1430 if (likely(vma->vm_start <= address)) 1431 goto good_area; 1432 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { 1433 bad_area(regs, hw_error_code, address); 1434 return; 1435 } 1436 if (unlikely(expand_stack(vma, address))) { 1437 bad_area(regs, hw_error_code, address); 1438 return; 1439 } 1440 1441 /* 1442 * Ok, we have a good vm_area for this memory access, so 1443 * we can handle it.. 1444 */ 1445 good_area: 1446 if (unlikely(access_error(hw_error_code, vma))) { 1447 bad_area_access_error(regs, hw_error_code, address, vma); 1448 return; 1449 } 1450 1451 /* 1452 * If for any reason at all we couldn't handle the fault, 1453 * make sure we exit gracefully rather than endlessly redo 1454 * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if 1455 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. 1456 * 1457 * Note that handle_userfault() may also release and reacquire mmap_sem 1458 * (and not return with VM_FAULT_RETRY), when returning to userland to 1459 * repeat the page fault later with a VM_FAULT_NOPAGE retval 1460 * (potentially after handling any pending signal during the return to 1461 * userland). The return to userland is identified whenever 1462 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. 1463 */ 1464 fault = handle_mm_fault(vma, address, flags); 1465 major |= fault & VM_FAULT_MAJOR; 1466 1467 /* 1468 * If we need to retry the mmap_sem has already been released, 1469 * and if there is a fatal signal pending there is no guarantee 1470 * that we made any progress. Handle this case first. 1471 */ 1472 if (unlikely(fault & VM_FAULT_RETRY)) { 1473 /* Retry at most once */ 1474 if (flags & FAULT_FLAG_ALLOW_RETRY) { 1475 flags &= ~FAULT_FLAG_ALLOW_RETRY; 1476 flags |= FAULT_FLAG_TRIED; 1477 if (!fatal_signal_pending(tsk)) 1478 goto retry; 1479 } 1480 1481 /* User mode? Just return to handle the fatal exception */ 1482 if (flags & FAULT_FLAG_USER) 1483 return; 1484 1485 /* Not returning to user mode? Handle exceptions or die: */ 1486 no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR); 1487 return; 1488 } 1489 1490 up_read(&mm->mmap_sem); 1491 if (unlikely(fault & VM_FAULT_ERROR)) { 1492 mm_fault_error(regs, hw_error_code, address, fault); 1493 return; 1494 } 1495 1496 /* 1497 * Major/minor page fault accounting. If any of the events 1498 * returned VM_FAULT_MAJOR, we account it as a major fault. 1499 */ 1500 if (major) { 1501 tsk->maj_flt++; 1502 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); 1503 } else { 1504 tsk->min_flt++; 1505 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); 1506 } 1507 1508 check_v8086_mode(regs, address, tsk); 1509 } 1510 NOKPROBE_SYMBOL(do_user_addr_fault); 1511 1512 static __always_inline void 1513 trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, 1514 unsigned long address) 1515 { 1516 if (!trace_pagefault_enabled()) 1517 return; 1518 1519 if (user_mode(regs)) 1520 trace_page_fault_user(address, regs, error_code); 1521 else 1522 trace_page_fault_kernel(address, regs, error_code); 1523 } 1524 1525 dotraplinkage void 1526 do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, 1527 unsigned long address) 1528 { 1529 prefetchw(¤t->mm->mmap_sem); 1530 trace_page_fault_entries(regs, hw_error_code, address); 1531 1532 if (unlikely(kmmio_fault(regs, address))) 1533 return; 1534 1535 /* Was the fault on kernel-controlled part of the address space? */ 1536 if (unlikely(fault_in_kernel_space(address))) 1537 do_kern_addr_fault(regs, hw_error_code, address); 1538 else 1539 do_user_addr_fault(regs, hw_error_code, address); 1540 } 1541 NOKPROBE_SYMBOL(do_page_fault); 1542