/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>

#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
#include <asm/vm86.h>
#include <asm/umip.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
#include <asm/proto.h>
#endif

DECLARE_BITMAP(system_vectors, NR_VECTORS);

static inline void cond_local_irq_enable(struct pt_regs *regs)
{
	if (regs->flags & X86_EFLAGS_IF)
		local_irq_enable();
}

static inline void cond_local_irq_disable(struct pt_regs *regs)
{
	if (regs->flags & X86_EFLAGS_IF)
		local_irq_disable();
}

int is_valid_bugaddr(unsigned long addr)
{
	unsigned short ud;

	if (addr < TASK_SIZE_MAX)
		return 0;

	if (probe_kernel_address((unsigned short *)addr, ud))
		return 0;

	return ud == INSN_UD0 || ud == INSN_UD2;
}

int fixup_bug(struct pt_regs *regs, int trapnr)
{
	if (trapnr != X86_TRAP_UD)
		return 0;

	switch (report_bug(regs->ip, regs)) {
	case BUG_TRAP_TYPE_NONE:
	case BUG_TRAP_TYPE_BUG:
		break;

	case BUG_TRAP_TYPE_WARN:
		regs->ip += LEN_UD2;
		return 1;
	}

	return 0;
}

static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
		  struct pt_regs *regs, long error_code)
{
	if (v8086_mode(regs)) {
		/*
		 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
		 * On nmi (interrupt 2), do_trap should not be called.
		 */
		if (trapnr < X86_TRAP_UD) {
			if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
					      error_code, trapnr))
				return 0;
		}
	} else if (!user_mode(regs)) {
		if (fixup_exception(regs, trapnr, error_code, 0))
			return 0;

		tsk->thread.error_code = error_code;
		tsk->thread.trap_nr = trapnr;
		die(str, regs, error_code);
	}

	/*
	 * We want error_code and trap_nr set for userspace faults and
	 * kernelspace faults which result in die(), but not
	 * kernelspace faults which are fixed up.  die() gives the
	 * process no chance to handle the signal and notice the
	 * kernel fault information, so that won't result in polluting
	 * the information about previously queued, but not yet
	 * delivered, faults.  See also exc_general_protection below.
	 */
	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = trapnr;

	return -1;
}

static void show_signal(struct task_struct *tsk, int signr,
			const char *type, const char *desc,
			struct pt_regs *regs, long error_code)
{
	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
	    printk_ratelimit()) {
		pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
			tsk->comm, task_pid_nr(tsk), type, desc,
			regs->ip, regs->sp, error_code);
		print_vma_addr(KERN_CONT " in ", regs->ip);
		pr_cont("\n");
	}
}

static void
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
	long error_code, int sicode, void __user *addr)
{
	struct task_struct *tsk = current;

	if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
		return;

	show_signal(tsk, signr, "trap ", str, regs, error_code);

	if (!sicode)
		force_sig(signr);
	else
		force_sig_fault(signr, sicode, addr);
}
NOKPROBE_SYMBOL(do_trap);

static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
			  unsigned long trapnr, int signr, int sicode,
			  void __user *addr)
{
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");

	/*
	 * WARN*()s end up here; fix them up before we call the
	 * notifier chain.
	 */
	if (!user_mode(regs) && fixup_bug(regs, trapnr))
		return;

	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
			NOTIFY_STOP) {
		cond_local_irq_enable(regs);
		do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
		cond_local_irq_disable(regs);
	}
}

/*
 * POSIX requires the address of the faulting instruction to be provided for
 * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
 *
 * This address is usually regs->ip, but when an uprobe moved the code out
 * of line then regs->ip points to the XOL code which would confuse
 * anything which analyzes the fault address vs. the unmodified binary. If
 * a trap happened in XOL code then uprobe maps regs->ip back to the
 * original instruction address.
 */
static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
{
	return (void __user *)uprobe_get_trap_addr(regs);
}

DEFINE_IDTENTRY(exc_divide_error)
{
	do_error_trap(regs, 0, "divide_error", X86_TRAP_DE, SIGFPE,
		      FPE_INTDIV, error_get_trap_addr(regs));
}

DEFINE_IDTENTRY(exc_overflow)
{
	do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}

#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
static inline void handle_invalid_op(struct pt_regs *regs)
#endif
{
	do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
		      ILL_ILLOPN, error_get_trap_addr(regs));
}

DEFINE_IDTENTRY(exc_invalid_op)
{
	handle_invalid_op(regs);
}

DEFINE_IDTENTRY(exc_coproc_segment_overrun)
{
	do_error_trap(regs, 0, "coprocessor segment overrun",
		      X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
}

DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
{
	do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
		      0, NULL);
}

DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
{
	do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
		      SIGBUS, 0, NULL);
}

DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
{
	do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
		      0, NULL);
}

DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
{
	char *str = "alignment check";

	if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
		return;

	if (!user_mode(regs))
		die("Split lock detected\n", regs, error_code);

	local_irq_enable();

	if (handle_user_split_lock(regs, error_code))
		return;

	do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
		error_code, BUS_ADRALN, NULL);
}

#ifdef CONFIG_VMAP_STACK
__visible void __noreturn handle_stack_overflow(const char *message,
						struct pt_regs *regs,
						unsigned long fault_address)
{
	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
		 (void *)fault_address, current->stack,
		 (char *)current->stack + THREAD_SIZE - 1);
	die(message, regs, 0);

	/* Be absolutely certain we don't return. */
	panic("%s", message);
}
#endif

/*
 * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
 *
 * On x86_64, this is more or less a normal kernel entry.  Notwithstanding the
 * SDM's warnings about double faults being unrecoverable, returning works as
 * expected.  Presumably what the SDM actually means is that the CPU may get
 * the register state wrong on entry, so returning could be a bad idea.
 *
 * Various CPU engineers have promised that double faults due to an IRET fault
 * while the stack is read-only are, in fact, recoverable.
 *
 * On x86_32, this is entered through a task gate, and regs are synthesized
 * from the TSS.  Returning is, in principle, okay, but changes to regs will
 * be lost.  If, for some reason, we need to return to a context with modified
 * regs, the shim code could be adjusted to synchronize the registers.
 *
 * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs
 * to be read before doing anything else.
 */
DEFINE_IDTENTRY_DF(exc_double_fault)
{
	static const char str[] = "double fault";
	struct task_struct *tsk = current;

#ifdef CONFIG_VMAP_STACK
	unsigned long address = read_cr2();
#endif

#ifdef CONFIG_X86_ESPFIX64
	extern unsigned char native_irq_return_iret[];

	/*
	 * If IRET takes a non-IST fault on the espfix64 stack, then we
	 * end up promoting it to a doublefault.  In that case, take
	 * advantage of the fact that we're not using the normal (TSS.sp0)
	 * stack right now.  We can write a fake #GP(0) frame at TSS.sp0
	 * and then modify our own IRET frame so that, when we return,
	 * we land directly at the #GP(0) vector with the stack already
	 * set up according to its expectations.
	 *
	 * The net result is that our #GP handler will think that we
	 * entered from usermode with the bad user context.
	 *
	 * No need for nmi_enter() here because we don't use RCU.
	 */
	if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
		regs->cs == __KERNEL_CS &&
		regs->ip == (unsigned long)native_irq_return_iret)
	{
		struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
		unsigned long *p = (unsigned long *)regs->sp;

		/*
		 * regs->sp points to the failing IRET frame on the
		 * ESPFIX64 stack.  Copy it to the entry stack.  This fills
		 * in gpregs->ss through gpregs->ip.
		 */
		gpregs->ip = p[0];
		gpregs->cs = p[1];
		gpregs->flags = p[2];
		gpregs->sp = p[3];
		gpregs->ss = p[4];
		gpregs->orig_ax = 0;	/* Missing (lost) #GP error code */

		/*
		 * Adjust our frame so that we return straight to the #GP
		 * vector with the expected RSP value.  This is safe because
		 * we won't enable interrupts or schedule before we invoke
		 * general_protection, so nothing will clobber the stack
		 * frame we just set up.
		 *
		 * We will enter general_protection with kernel GSBASE,
		 * which is what the stub expects, given that the faulting
		 * RIP will be the IRET instruction.
		 */
		regs->ip = (unsigned long)asm_exc_general_protection;
		regs->sp = (unsigned long)&gpregs->orig_ax;

		return;
	}
#endif

	nmi_enter();
	instrumentation_begin();
	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);

	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = X86_TRAP_DF;

#ifdef CONFIG_VMAP_STACK
	/*
	 * If we overflow the stack into a guard page, the CPU will fail
	 * to deliver #PF and will send #DF instead.  Similarly, if we
	 * take any non-IST exception while too close to the bottom of
	 * the stack, the processor will get a page fault while
	 * delivering the exception and will generate a double fault.
	 *
	 * According to the SDM (footnote in 6.15 under "Interrupt 14 -
	 * Page-Fault Exception (#PF)"):
	 *
	 *   Processors update CR2 whenever a page fault is detected. If a
	 *   second page fault occurs while an earlier page fault is being
	 *   delivered, the faulting linear address of the second fault will
	 *   overwrite the contents of CR2 (replacing the previous
	 *   address). These updates to CR2 occur even if the page fault
	 *   results in a double fault or occurs during the delivery of a
	 *   double fault.
	 *
	 * The logic below has a small possibility of incorrectly diagnosing
	 * some errors as stack overflows.  For example, if the IDT or GDT
	 * gets corrupted such that #GP delivery fails due to a bad descriptor
	 * causing #GP and we hit this condition while CR2 coincidentally
	 * points to the stack guard page, we'll think we overflowed the
	 * stack.  Given that we're going to panic one way or another
	 * if this happens, this isn't necessarily worth fixing.
	 *
	 * If necessary, we could improve the test by only diagnosing
	 * a stack overflow if the saved RSP points within 47 bytes of
	 * the bottom of the stack: if RSP == tsk_stack + 48 and we
	 * take an exception, the stack is already aligned and there
	 * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
	 * possible error code, so a stack overflow would *not* double
	 * fault.  With any less space left, exception delivery could
	 * fail, and, as a practical matter, we've overflowed the
	 * stack even if the actual trigger for the double fault was
	 * something else.
	 */
	if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
		handle_stack_overflow("kernel stack overflow (double-fault)",
				      regs, address);
	}
#endif

	pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
	die("double fault", regs, error_code);
	panic("Machine halted.");
	instrumentation_end();
}

DEFINE_IDTENTRY(exc_bounds)
{
	if (notify_die(DIE_TRAP, "bounds", regs, 0,
			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
		return;
	cond_local_irq_enable(regs);

	if (!user_mode(regs))
		die("bounds", regs, 0);

	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);

	cond_local_irq_disable(regs);
}

enum kernel_gp_hint {
	GP_NO_HINT,
	GP_NON_CANONICAL,
	GP_CANONICAL
};

/*
 * When an uncaught #GP occurs, try to determine the memory address accessed by
 * the instruction and return that address to the caller.  Also, try to figure
 * out whether any part of the access to that address was non-canonical.
 */
static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
						 unsigned long *addr)
{
	u8 insn_buf[MAX_INSN_SIZE];
	struct insn insn;

	if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
		return GP_NO_HINT;

	kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
	insn_get_modrm(&insn);
	insn_get_sib(&insn);

	*addr = (unsigned long)insn_get_addr_ref(&insn, regs);
	if (*addr == -1UL)
		return GP_NO_HINT;

#ifdef CONFIG_X86_64
	/*
	 * Check that:
	 *  - the operand is not in the kernel half
	 *  - the last byte of the operand is not in the user canonical half
	 */
	if (*addr < ~__VIRTUAL_MASK &&
	    *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
		return GP_NON_CANONICAL;
#endif

	return GP_CANONICAL;
}

#define GPFSTR "general protection fault"

DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
	char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
	enum kernel_gp_hint hint = GP_NO_HINT;
	struct task_struct *tsk;
	unsigned long gp_addr;
	int ret;

	cond_local_irq_enable(regs);

	if (static_cpu_has(X86_FEATURE_UMIP)) {
		if (user_mode(regs) && fixup_umip_exception(regs))
			goto exit;
	}

	if (v8086_mode(regs)) {
		local_irq_enable();
		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
		local_irq_disable();
		return;
	}

	tsk = current;

	if (user_mode(regs)) {
		tsk->thread.error_code = error_code;
		tsk->thread.trap_nr = X86_TRAP_GP;

		show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
		force_sig(SIGSEGV);
		goto exit;
	}

	if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
		goto exit;

	tsk->thread.error_code = error_code;
	tsk->thread.trap_nr = X86_TRAP_GP;

	/*
	 * To be potentially processing a kprobe fault and to trust the result
	 * from kprobe_running(), we have to be non-preemptible.
	 */
	if (!preemptible() &&
	    kprobe_running() &&
	    kprobe_fault_handler(regs, X86_TRAP_GP))
		goto exit;

	ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
	if (ret == NOTIFY_STOP)
		goto exit;

	if (error_code)
		snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
	else
		hint = get_kernel_gp_address(regs, &gp_addr);

	if (hint != GP_NO_HINT)
		snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
			 (hint == GP_NON_CANONICAL) ? "probably for non-canonical address"
						    : "maybe for address",
			 gp_addr);

	/*
	 * KASAN is interested only in the non-canonical case, clear it
	 * otherwise.
	 */
	if (hint != GP_NON_CANONICAL)
		gp_addr = 0;

	die_addr(desc, regs, error_code, gp_addr);

exit:
	cond_local_irq_disable(regs);
}

static bool do_int3(struct pt_regs *regs)
{
	int res;

#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
	if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
			 SIGTRAP) == NOTIFY_STOP)
		return true;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */

#ifdef CONFIG_KPROBES
	if (kprobe_int3_handler(regs))
		return true;
#endif
	res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);

	return res == NOTIFY_STOP;
}

static void do_int3_user(struct pt_regs *regs)
{
	if (do_int3(regs))
		return;

	cond_local_irq_enable(regs);
	do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
	cond_local_irq_disable(regs);
}

DEFINE_IDTENTRY_RAW(exc_int3)
{
	/*
	 * poke_int3_handler() is completely self-contained code; it does (and
	 * must) *NOT* call out to anything, lest it hits upon yet another
	 * INT3.
	 */
	if (poke_int3_handler(regs))
		return;

	/*
	 * idtentry_enter_user() uses static_branch_{,un}likely() and therefore
	 * can trigger INT3, hence poke_int3_handler() must be done
	 * before.  If the entry came from kernel mode, then use nmi_enter()
	 * because the INT3 could have been hit in any context including
	 * NMI.
	 */
	if (user_mode(regs)) {
		idtentry_enter_user(regs);
		instrumentation_begin();
		do_int3_user(regs);
		instrumentation_end();
		idtentry_exit_user(regs);
	} else {
		nmi_enter();
		instrumentation_begin();
		if (!do_int3(regs))
			die("int3", regs, 0);
		instrumentation_end();
		nmi_exit();
	}
}

#ifdef CONFIG_X86_64
/*
 * Help handler running on a per-cpu (IST or entry trampoline) stack
 * to switch to the normal thread stack if the interrupted code was in
 * user mode.  The actual stack switch is done in entry_64.S
 */
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
	struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;

	if (regs != eregs)
		*regs = *eregs;
	return regs;
}

struct bad_iret_stack {
	void *error_entry_ret;
	struct pt_regs regs;
};

asmlinkage __visible noinstr
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
{
	/*
	 * This is called from entry_64.S early in handling a fault
	 * caused by a bad iret to user mode.  To handle the fault
	 * correctly, we want to move our stack frame to where it would
	 * be had we entered directly on the entry stack (rather than
	 * just below the IRET frame) and we want to pretend that the
	 * exception came from the IRET target.
	 */
	struct bad_iret_stack tmp, *new_stack =
		(struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;

	/* Copy the IRET target to the temporary storage. */
	memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);

	/* Copy the remainder of the stack from the current stack. */
	memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));

	/* Update the entry stack */
	memcpy(new_stack, &tmp, sizeof(tmp));

	BUG_ON(!user_mode(&new_stack->regs));
	return new_stack;
}
#endif

static bool is_sysenter_singlestep(struct pt_regs *regs)
{
	/*
	 * We don't try for precision here.  If we're anywhere in the region of
	 * code that can be single-stepped in the SYSENTER entry path, then
	 * assume that this is a useless single-step trap due to SYSENTER
	 * being invoked with TF set.  (We don't know in advance exactly
	 * which instructions will be hit because BTF could plausibly
	 * be set.)
	 */
#ifdef CONFIG_X86_32
	return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
		(unsigned long)__end_SYSENTER_singlestep_region -
		(unsigned long)__begin_SYSENTER_singlestep_region;
#elif defined(CONFIG_IA32_EMULATION)
	return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
		(unsigned long)__end_entry_SYSENTER_compat -
		(unsigned long)entry_SYSENTER_compat;
#else
	return false;
#endif
}

static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
{
	/*
	 * Disable breakpoints during exception handling; recursive exceptions
	 * are exceedingly 'fun'.
	 *
	 * Since this function is NOKPROBE, and that also applies to
	 * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
	 * HW_BREAKPOINT_W on our stack)
	 *
	 * Entry text is excluded for HW_BP_X and cpu_entry_area, which
	 * includes the entry stack, is excluded for everything.
	 */
	get_debugreg(*dr7, 7);
	set_debugreg(0, 7);

	/*
	 * Ensure the compiler doesn't lower the above statements into
	 * the critical section; disabling breakpoints late would not
	 * be good.
	 */
	barrier();

	/*
	 * The Intel SDM says:
	 *
	 *   Certain debug exceptions may clear bits 0-3. The remaining
	 *   contents of the DR6 register are never cleared by the
	 *   processor. To avoid confusion in identifying debug
	 *   exceptions, debug handlers should clear the register before
	 *   returning to the interrupted task.
	 *
	 * Keep it simple: clear DR6 immediately.
	 */
	get_debugreg(*dr6, 6);
	set_debugreg(0, 6);
	/* Filter out all the reserved bits which are preset to 1 */
	*dr6 &= ~DR6_RESERVED;
}

static __always_inline void debug_exit(unsigned long dr7)
{
	/*
	 * Ensure the compiler doesn't raise this statement into
	 * the critical section; enabling breakpoints early would
	 * not be good.
	 */
	barrier();
	set_debugreg(dr7, 7);
}

/*
 * Our handling of the processor debug registers is non-trivial.
 * We do not clear them on entry and exit from the kernel.  Therefore
 * it is possible to get a watchpoint trap here from inside the kernel.
 * However, the code in ./ptrace.c has ensured that the user can
 * only set watchpoints on userspace addresses.  Therefore the in-kernel
 * watchpoint trap can only occur in code which is reading/writing
 * from user space.  Such code must not hold kernel locks (since it
 * can equally take a page fault), therefore it is safe to call
 * force_sig_info even though that claims and releases locks.
 *
 * Code in ./signal.c ensures that the debug control register
 * is restored before we deliver any signal, and therefore that
 * user code runs with the correct debug control register even though
 * we clear it here.
 *
 * Being careful here means that we don't have to be as careful in a
 * lot of more complicated places (task switching can be a bit lazy
 * about restoring all the debug state, and ptrace doesn't have to
 * find every occurrence of the TF bit that could be saved away even
 * by user code)
 *
 * May run on IST stack.
 */
static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6,
				 bool user_icebp)
{
	struct task_struct *tsk = current;
	int si_code;

	/* Store the virtualized DR6 value */
	tsk->thread.debugreg6 = dr6;

	instrumentation_begin();
#ifdef CONFIG_KPROBES
	if (kprobe_debug_handler(regs)) {
		instrumentation_end();
		return;
	}
#endif

	if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0,
		       SIGTRAP) == NOTIFY_STOP) {
		instrumentation_end();
		return;
	}

	/*
	 * Let others (NMI) know that the debug stack is in use
	 * as we may switch to the interrupt stack.
	 */
	debug_stack_usage_inc();

	/* It's safe to allow irq's after DR6 has been saved */
	cond_local_irq_enable(regs);

	if (v8086_mode(regs)) {
		handle_vm86_trap((struct kernel_vm86_regs *) regs, 0,
				 X86_TRAP_DB);
		goto out;
	}

	if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
		/*
		 * Historical junk that used to handle SYSENTER single-stepping.
		 * This should be unreachable now.  If we survive for a while
		 * without anyone hitting this warning, we'll turn this into
		 * an oops.
		 */
		tsk->thread.debugreg6 &= ~DR_STEP;
		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
		regs->flags &= ~X86_EFLAGS_TF;
	}

	si_code = get_si_code(tsk->thread.debugreg6);
	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
		send_sigtrap(regs, 0, si_code);

out:
	cond_local_irq_disable(regs);
	debug_stack_usage_dec();
	instrumentation_end();
}

static __always_inline void exc_debug_kernel(struct pt_regs *regs,
					     unsigned long dr6)
{
	nmi_enter();
	/*
	 * The SDM says "The processor clears the BTF flag when it
	 * generates a debug exception."  Clear TIF_BLOCKSTEP to keep
	 * TIF_BLOCKSTEP in sync with the hardware BTF flag.
	 */
	clear_thread_flag(TIF_BLOCKSTEP);

	/*
	 * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
	 * watchpoint at the same time then that will still be handled.
	 */
	if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
		dr6 &= ~DR_STEP;

	/*
	 * If DR6 is zero, no point in trying to handle it. The kernel is
	 * not using INT1.
	 */
	if (dr6)
		handle_debug(regs, dr6, false);

	nmi_exit();
}

static __always_inline void exc_debug_user(struct pt_regs *regs,
					   unsigned long dr6)
{
	idtentry_enter_user(regs);
	clear_thread_flag(TIF_BLOCKSTEP);

	/*
	 * If DR6 gives us no clue about the origin of this trap, then it is
	 * very likely the result of an icebp/int01 trap.  The user wants a
	 * SIGTRAP for that.
	 */
	handle_debug(regs, dr6, !dr6);
	idtentry_exit_user(regs);
}

#ifdef CONFIG_X86_64
/* IST stack entry */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
	unsigned long dr6, dr7;

	debug_enter(&dr6, &dr7);
	exc_debug_kernel(regs, dr6);
	debug_exit(dr7);
}

/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
	unsigned long dr6, dr7;

	debug_enter(&dr6, &dr7);
	exc_debug_user(regs, dr6);
	debug_exit(dr7);
}
#else
/* 32 bit does not have separate entry points. */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
	unsigned long dr6, dr7;

	debug_enter(&dr6, &dr7);

	if (user_mode(regs))
		exc_debug_user(regs, dr6);
	else
		exc_debug_kernel(regs, dr6);

	debug_exit(dr7);
}
#endif

/*
 * Note that we play around with the 'TS' bit in an attempt to get
 * the correct behaviour even in the presence of the asynchronous
 * IRQ13 behaviour
 */
static void math_error(struct pt_regs *regs, int trapnr)
{
	struct task_struct *task = current;
	struct fpu *fpu = &task->thread.fpu;
	int si_code;
	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
						"simd exception";

	cond_local_irq_enable(regs);

	if (!user_mode(regs)) {
		if (fixup_exception(regs, trapnr, 0, 0))
			goto exit;

		task->thread.error_code = 0;
		task->thread.trap_nr = trapnr;

		if (notify_die(DIE_TRAP, str, regs, 0, trapnr,
			       SIGFPE) != NOTIFY_STOP)
			die(str, regs, 0);
		goto exit;
	}

	/*
	 * Save the info for the exception handler and clear the error.
	 */
	fpu__save(fpu);

	task->thread.trap_nr = trapnr;
	task->thread.error_code = 0;

	si_code = fpu__exception_code(fpu, trapnr);
	/* Retry when we get spurious exceptions: */
	if (!si_code)
		goto exit;

	force_sig_fault(SIGFPE, si_code,
			(void __user *)uprobe_get_trap_addr(regs));
exit:
	cond_local_irq_disable(regs);
}

DEFINE_IDTENTRY(exc_coprocessor_error)
{
	math_error(regs, X86_TRAP_MF);
}

DEFINE_IDTENTRY(exc_simd_coprocessor_error)
{
	if (IS_ENABLED(CONFIG_X86_INVD_BUG)) {
		/* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */
		if (!static_cpu_has(X86_FEATURE_XMM)) {
			__exc_general_protection(regs, 0);
			return;
		}
	}
	math_error(regs, X86_TRAP_XF);
}

DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
{
	/*
	 * This addresses a Pentium Pro Erratum:
	 *
	 * PROBLEM: If the APIC subsystem is configured in mixed mode with
	 * Virtual Wire mode implemented through the local APIC, an
	 * interrupt vector of 0Fh (Intel reserved encoding) may be
	 * generated by the local APIC (Int 15).  This vector may be
	 * generated upon receipt of a spurious interrupt (an interrupt
	 * which is removed before the system receives the INTA sequence)
	 * instead of the programmed 8259 spurious interrupt vector.
	 *
	 * IMPLICATION: The spurious interrupt vector programmed in the
	 * 8259 is normally handled by an operating system's spurious
	 * interrupt handler. However, a vector of 0Fh is unknown to some
	 * operating systems, which would crash if this erratum occurred.
	 *
	 * In theory this could be limited to 32bit, but the handler is not
	 * hurting and who knows which other CPUs suffer from this.
	 */
}

DEFINE_IDTENTRY(exc_device_not_available)
{
	unsigned long cr0 = read_cr0();

#ifdef CONFIG_MATH_EMULATION
	if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
		struct math_emu_info info = { };

		cond_local_irq_enable(regs);

		info.regs = regs;
		math_emulate(&info);

		cond_local_irq_disable(regs);
		return;
	}
#endif

	/* This should not happen. */
	if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
		/* Try to fix it up and carry on. */
		write_cr0(cr0 & ~X86_CR0_TS);
	} else {
		/*
		 * Something terrible happened, and we're better off trying
		 * to kill the task than getting stuck in a never-ending
		 * loop of #NM faults.
		 */
		die("unexpected #NM exception", regs, 0);
	}
}

#ifdef CONFIG_X86_32
DEFINE_IDTENTRY_SW(iret_error)
{
	local_irq_enable();
	if (notify_die(DIE_TRAP, "iret exception", regs, 0,
			X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
			ILL_BADSTK, (void __user *)NULL);
	}
	local_irq_disable();
}
#endif

void __init trap_init(void)
{
	/* Init cpu_entry_area before IST entries are set up */
	setup_cpu_entry_areas();

	idt_setup_traps();

	/*
	 * Set the IDT descriptor to a fixed read-only location, so that the
	 * "sidt" instruction will not leak the location of the kernel, and
	 * to defend the IDT against arbitrary memory write vulnerabilities.
	 * It will be reloaded in cpu_init().
	 */
	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
		    PAGE_KERNEL_RO);
	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;

	/*
	 * Should be a barrier for any external CPU state:
	 */
	cpu_init();

	idt_setup_ist_traps();

	idt_setup_debugidt_traps();
}