/*
 * linux/arch/x86_64/entry.S
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 * Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - ENTRY/END:		Define functions in the symbol table.
 * - TRACE_IRQ_*:	Trace hardirq state for lock debugging.
 * - idtentry:		Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT	0x80000000
#define __AUDIT_ARCH_LE		0x40000000

.code64
.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */

.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * When dynamic function tracer is enabled it will add a breakpoint
 * to all locations that it is about to modify, sync CPUs, update
 * all the code, sync CPUs, then remove the breakpoints. In this time
 * if lockdep is enabled, it might jump back into the debug handler
 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
 *
 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
 * make sure the stack pointer does not get reset back to the top
 * of the debug stack, and instead just reuses the current stack.
 */
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)

.macro TRACE_IRQS_OFF_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_OFF
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_ON_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_ON
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_IRETQ_DEBUG
	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON_DEBUG
1:
.endm

#else
# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
#endif

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls. The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries. There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
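 *
 * Purely as an illustration (not part of this file; msg and len are
 * placeholder symbols), a raw 64-bit syscall from user space -- here
 * write(2), which is syscall number 1 on x86-64 -- looks roughly like:
 *
 *	movl	$1, %eax		# system call number (__NR_write)
 *	movl	$1, %edi		# arg0: fd (stdout)
 *	leaq	msg(%rip), %rsi		# arg1: buf
 *	movl	$len, %edx		# arg2: count
 *	syscall				# rcx := rip, r11 := rflags,
 *					# rax := return value or -errno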
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is a callee-clobbered register in the C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to the C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in the C ABI)
 *
 * Only called from user space.
 *
 * When the user can change pt_regs->foo, always force IRET. That is because
 * it deals with non-canonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

ENTRY(entry_SYSCALL_64)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(entry_SYSCALL_64_after_swapgs)

	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	TRACE_IRQS_OFF

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
	pushq	%rax				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	%rdx				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	$-ENOSYS			/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

	/*
	 * If we need to do entry work or if we guess we'll need to do
	 * exit work, go straight to the slow path.
	 */
	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
	/*
	 * Easy case: enable interrupts and issue the syscall. If the syscall
	 * needs pt_regs, we'll call a stub that disables interrupts again
	 * and jumps to the slow path.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10, %rcx

	/*
	 * This call instruction is handled specially in stub_ptregs_64.
	 * It might end up jumping to the slow path. If it jumps, RAX
	 * and all argument registers are clobbered.
	 */
	call	*sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:

	movq	%rax, RAX(%rsp)
1:

	/*
	 * If we get here, then we know that pt_regs is clean for SYSRET64.
	 * If we see that no exit work is required (which we are required
	 * to check with IRQs off), then we can go straight to SYSRET64.
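	 * (The check must be done with IRQs off: an interrupt arriving
	 * between the check and SYSRET64 could set a work flag that this
	 * exit path would then miss.)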
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	1f

	LOCKDEP_SYS_EXIT
	TRACE_IRQS_ON				/* user mode is traced as IRQs on */
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

1:
	/*
	 * The fast path looked good when we started, but something changed
	 * along the way and we need to switch to the slow path. Calling
	 * raise(3) will trigger this, for example. IRQs are off.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	call	syscall_return_slowpath		/* returns with IRQs disabled */
	jmp	return_from_SYSCALL_64

entry_SYSCALL64_slow_path:
	/* IRQs are off. */
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	call	do_syscall_64			/* returns with IRQs disabled */

return_from_SYSCALL_64:
	RESTORE_EXTRA_REGS
	TRACE_IRQS_IRETQ			/* we're about to change IF */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11
	cmpq	%rcx, %r11			/* RCX == RIP */
	jne	opportunistic_sysret_failed

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space. This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If the width of the "canonical tail" ever becomes variable, this will
	 * need to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- SYSRET checks need update"
	.endif

	/* Change the top 16 bits to be the sign-extension of the 47th bit */
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	opportunistic_sysret_failed

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	opportunistic_sysret_failed

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	opportunistic_sysret_failed

	/*
	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
	 * restore RF properly. If the slowpath sets it for whatever reason, we
	 * need to restore it correctly.
	 *
	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
	 * trap from userspace immediately after SYSRET. This would cause an
	 * infinite loop whenever #DB happens with register state that satisfies
	 * the opportunistic SYSRET conditions. For example, single-stepping
	 * this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq	%r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	opportunistic_sysret_failed

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	SWAPGS
	jmp	restore_c_regs_and_iret
END(entry_SYSCALL_64)

ENTRY(stub_ptregs_64)
	/*
	 * Syscalls marked as needing ptregs land here.
	 * If we are on the fast path, we need to save the extra regs,
	 * which we achieve by trying again on the slow path. If we are on
	 * the slow path, the extra regs are already saved.
	 *
	 * RAX stores a pointer to the C function implementing the syscall.
	 * IRQs are on.
	 */
	cmpq	$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
	jne	1f

	/*
	 * Called from fast path -- disable IRQs again, pop return address
	 * and jump to slow path
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	popq	%rax
	jmp	entry_SYSCALL64_slow_path

1:
	jmp	*%rax				/* called from C */
END(stub_ptregs_64)

.macro ptregs_stub func
ENTRY(ptregs_\func)
	leaq	\func(%rip), %rax
	jmp	stub_ptregs_64
END(ptregs_\func)
.endm

/* Instantiate ptregs_stub for each ptregs-using syscall */
#define __SYSCALL_64_QUAL_(sym)
#define __SYSCALL_64_QUAL_ptregs(sym)	ptregs_stub sym
#define __SYSCALL_64(nr, sym, qual)	__SYSCALL_64_QUAL_##qual(sym)
#include <asm/syscalls_64.h>

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)
	LOCK ; btr $TIF_FORK, TI_flags(%r8)

	call	schedule_tail			/* rdi: 'prev' task parameter */

	testb	$3, CS(%rsp)			/* from kernel_thread? */
	jnz	1f

	/*
	 * We came from kernel_thread. This code path is quite twisted, and
	 * someone should clean it up.
	 *
	 * copy_thread_tls stashes the function pointer in RBX and the
	 * parameter to be passed in RBP. The called function is permitted
	 * to call do_execve and thereby jump to user mode.
	 */
	movq	RBP(%rsp), %rdi
	call	*RBX(%rsp)
	movl	$0, RAX(%rsp)

	/*
	 * Fall through as though we're exiting a syscall. This makes a
	 * twisted sort of sense if we just called do_execve.
	 */

1:
	movq	%rsp, %rdi
	call	syscall_return_slowpath		/* returns with IRQs disabled */
	TRACE_IRQS_ON				/* user mode is traced as IRQs on */
	SWAPGS
	jmp	restore_regs_and_iret
END(ret_from_fork)

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 */
	.align 8
ENTRY(irq_entries_start)
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushq	$(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_interrupt
	.align	8
    .endr
END(irq_entries_start)

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee-clobbered registers in the fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	cld
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS

	testb	$3, CS(%rsp)
	jz	1f

	/*
	 * IRQ from user mode. Switch to kernel gsbase and inform context
	 * tracking that we're in kernel mode.
	 */
	SWAPGS

	/*
	 * We need to tell lockdep that IRQs are off.
	 * We can't do this until we fix gsbase, and we should do it before
	 * enter_from_user_mode (which can take locks). Since TRACE_IRQS_OFF
	 * is idempotent, the simplest way to handle it is to just call it
	 * twice if we enter from user mode. There's no reason to optimize
	 * this since TRACE_IRQS_OFF is a no-op if lockdep is off.
	 */
	TRACE_IRQS_OFF

	CALL_enter_from_user_mode

1:
	/*
	 * Save previous stack pointer, optionally switch to interrupt stack.
	 * irq_count is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work).
	 */
	movq	%rsp, %rdi
	incl	PER_CPU_VAR(irq_count)
	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq	%rdi
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF

	call	\func				/* rdi points to pt_regs */
	.endm

	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addq	$-0x80, (%rsp)			/* Adjust vector to [-256, -1] range */
	interrupt do_IRQ
	/* 0(%rsp): old RSP */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl	PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq	%rsp

	testb	$3, CS(%rsp)
	jz	retint_kernel

	/* Interrupt came from user space */
GLOBAL(retint_user)
	mov	%rsp, %rdi
	call	prepare_exit_to_usermode
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp	restore_regs_and_iret

/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
	/* Interrupts are off */
	/* Check if we need preemption */
	bt	$9, EFLAGS(%rsp)		/* were interrupts off? */
	jnc	1f
0:	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	1f
	call	preempt_schedule_irq
	jmp	0b
1:
#endif
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ

/*
 * At this label, code paths which return to kernel and to user,
 * which come from interrupts/exceptions and from syscalls, merge.
 */
GLOBAL(restore_regs_and_iret)
	RESTORE_EXTRA_REGS
restore_c_regs_and_iret:
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN

ENTRY(native_iret)
	/*
	 * Are we returning to a stack segment from the LDT? Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

.global native_irq_return_iret
native_irq_return_iret:
	/*
	 * This may fault. Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in do_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	pushq	%rax
	pushq	%rdi
	SWAPGS
	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* RAX */
	movq	(2*8)(%rsp), %rax		/* RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* CS */
	movq	%rax, (2*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(6*8)(%rsp), %rax		/* SS */
	movq	%rax, (5*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* RSP */
	movq	%rax, (4*8)(%rdi)
	andl	$0xffff0000, %eax
	popq	%rdi
	orq	PER_CPU_VAR(espfix_stack), %rax
	SWAPGS
	movq	%rax, %rsp
	popq	%rax
	jmp	native_irq_return_iret
#endif
END(common_interrupt)

/*
 * APIC interrupts.
 */
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
	ASM_CLAC
	pushq	$~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp	ret_from_intr
END(\sym)
.endm

#ifdef CONFIG_TRACING
#define trace(sym) trace_##sym
#define smp_trace(sym) smp_trace_##sym

.macro trace_apicinterrupt num sym
apicinterrupt3 \num trace(\sym) smp_trace(\sym)
.endm
#else
.macro trace_apicinterrupt num sym do_sym
.endm
#endif

/* Make sure APIC interrupt handlers end up in the irqentry section: */
#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
# define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
# define POP_SECTION_IRQENTRY	.popsection
#else
# define PUSH_SECTION_IRQENTRY
# define POP_SECTION_IRQENTRY
#endif

.macro apicinterrupt num sym do_sym
PUSH_SECTION_IRQENTRY
apicinterrupt3 \num \sym \do_sym
trace_apicinterrupt \num \sym
POP_SECTION_IRQENTRY
.endm

#ifdef CONFIG_SMP
apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR		irq_move_cleanup_interrupt	smp_irq_move_cleanup_interrupt
apicinterrupt3 REBOOT_VECTOR			reboot_interrupt		smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt3 UV_BAU_MESSAGE			uv_bau_message_intr1		uv_bau_message_interrupt
#endif

apicinterrupt LOCAL_TIMER_VECTOR		apic_timer_interrupt		smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR		x86_platform_ipi		smp_x86_platform_ipi

#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR		kvm_posted_intr_ipi		smp_kvm_posted_intr_ipi
apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR	kvm_posted_intr_wakeup_ipi	smp_kvm_posted_intr_wakeup_ipi
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
apicinterrupt THRESHOLD_APIC_VECTOR		threshold_interrupt		smp_threshold_interrupt
#endif

#ifdef CONFIG_X86_MCE_AMD
apicinterrupt DEFERRED_ERROR_VECTOR		deferred_error_interrupt	smp_deferred_error_interrupt
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
apicinterrupt THERMAL_APIC_VECTOR		thermal_interrupt		smp_thermal_interrupt
#endif

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR	call_function_single_interrupt	smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR		call_function_interrupt		smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR			reschedule_interrupt		smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR			error_interrupt			smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR		spurious_interrupt		smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
#endif

/*
 * Exception entry points.
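 *
 * The idtentry macro below generates one such entry point per exception.
 * CPU_TSS_IST(x) resolves to this CPU's IST slot x in the TSS; handlers
 * built with shift_ist use it to move their IST stack down by
 * EXCEPTION_STKSZ around the call to \do_sym, so that a recursive
 * exception does not reuse the same stack region.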
 */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)

.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
	/* Sanity check */
	.if \shift_ist != -1 && \paranoid == 0
	.error "using shift_ist requires paranoid=1"
	.endif

	ASM_CLAC
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	.ifeq \has_error_code
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
	.endif

	ALLOC_PT_GPREGS_ON_STACK

	.if \paranoid
	.if \paranoid == 1
	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
	jnz	1f
	.endif
	call	paranoid_entry
	.else
	call	error_entry
	.endif
	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */

	.if \paranoid
	.if \shift_ist != -1
	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
	.else
	TRACE_IRQS_OFF
	.endif
	.endif

	movq	%rsp, %rdi			/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl	%esi, %esi			/* no error code */
	.endif

	.if \shift_ist != -1
	subq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	call	\do_sym

	.if \shift_ist != -1
	addq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	/* these procedures expect "no swapgs" flag in ebx */
	.if \paranoid
	jmp	paranoid_exit
	.else
	jmp	error_exit
	.endif

	.if \paranoid == 1
	/*
	 * Paranoid entry from userspace. Switch stacks and treat it
	 * as a normal entry. This means that paranoid handlers
	 * run in real process context if user_mode(regs).
	 */
1:
	call	error_entry

	movq	%rsp, %rdi			/* pt_regs pointer */
	call	sync_regs
	movq	%rax, %rsp			/* switch stack */

	movq	%rsp, %rdi			/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl	%esi, %esi			/* no error code */
	.endif

	call	\do_sym

	jmp	error_exit			/* %ebx: no swapgs flag */
	.endif
END(\sym)
.endm

#ifdef CONFIG_TRACING
.macro trace_idtentry sym do_sym has_error_code:req
idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#else
.macro trace_idtentry sym do_sym has_error_code:req
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif

idtentry divide_error			do_divide_error			has_error_code=0
idtentry overflow			do_overflow			has_error_code=0
idtentry bounds				do_bounds			has_error_code=0
idtentry invalid_op			do_invalid_op			has_error_code=0
idtentry device_not_available		do_device_not_available		has_error_code=0
idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
idtentry segment_not_present		do_segment_not_present		has_error_code=1
idtentry spurious_interrupt_bug		do_spurious_interrupt_bug	has_error_code=0
idtentry coprocessor_error		do_coprocessor_error		has_error_code=0
idtentry alignment_check		do_alignment_check		has_error_code=1
idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0

	/*
	 * Reload gs selector with exception handling
	 * edi:  new selector
	 */
ENTRY(native_load_gs_index)
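	/*
	 * Roughly the C view of this helper (as an aside, not a prototype
	 * taken from this file): void native_load_gs_index(unsigned int),
	 * with the new selector arriving in %edi per the C calling convention.
	 */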
	pushfq
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
.Lgs_change:
	movl	%edi, %gs
2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
	SWAPGS
	popfq
	ret
END(native_load_gs_index)

	_ASM_EXTABLE(.Lgs_change, bad_gs)
	.section .fixup, "ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS					/* switch back to user gs */
.macro ZAP_GS
	/* This can't be a string because the preprocessor needs to see it. */
	movl $__USER_DS, %eax
	movl %eax, %gs
.endm
	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b
	.previous

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(do_softirq_own_stack)
	pushq	%rbp
	mov	%rsp, %rbp
	incl	PER_CPU_VAR(irq_count)
	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
	push	%rbp				/* frame pointer backlink */
	call	__do_softirq
	leaveq
	decl	PER_CPU_VAR(irq_count)
	ret
END(do_softirq_own_stack)

#ifdef CONFIG_XEN
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct pt_regs *) */

/*
 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
 * see the correct pointer to the pt_regs
 */
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
11:	incl	PER_CPU_VAR(irq_count)
	movq	%rsp, %rbp
	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq	%rbp				/* frame pointer backlink */
	call	xen_evtchn_do_upcall
	popq	%rsp
	decl	PER_CPU_VAR(irq_count)
#ifndef CONFIG_PREEMPT
	call	xen_maybe_preempt_hcall
#endif
	jmp	error_exit
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/*
	 * All segments match their saved values => Category 2 (Bad IRET).
	 */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	pushq	%r11
	pushq	%rcx
	jmp	general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$-1				/* orig_ax = -1 => not a system call */
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	jmp	error_exit
END(xen_failsafe_callback)

apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */

idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment		do_stack_segment	has_error_code=1

#ifdef CONFIG_XEN
idtentry xen_debug		do_debug		has_error_code=0
idtentry xen_int3		do_int3			has_error_code=0
idtentry xen_stack_segment	do_stack_segment	has_error_code=1
#endif

idtentry general_protection	do_general_protection	has_error_code=1
trace_idtentry page_fault	do_page_fault		has_error_code=1

#ifdef CONFIG_KVM_GUEST
idtentry async_page_fault	do_async_page_fault	has_error_code=1
#endif

#ifdef CONFIG_X86_MCE
idtentry machine_check		has_error_code=0	paranoid=1 do_sym=*machine_check_vector(%rip)
#endif

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Use slow, but surefire "are we in kernel?" check.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
ENTRY(paranoid_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	movl	$1, %ebx
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
1:	ret
END(paranoid_entry)

/*
 * "Paranoid" exit path from exception stack. This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated. Fortunately, there's no good reason to try
 * to handle preemption here.
 *
 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
 */
ENTRY(paranoid_exit)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF_DEBUG
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	paranoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	SWAPGS_UNSAFE_STACK
	jmp	paranoid_exit_restore
paranoid_exit_no_swapgs:
	TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN
END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Return: EBX=0: came from user mode; EBX=1: otherwise
 */
ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	xorl	%ebx, %ebx
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

.Lerror_entry_from_usermode_swapgs:
	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	SWAPGS

.Lerror_entry_from_usermode_after_swapgs:
	/*
	 * We need to tell lockdep that IRQs are off.
	 * We can't do this until we fix gsbase, and we should do it before
	 * enter_from_user_mode (which can take locks).
	 */
	TRACE_IRQS_OFF
	CALL_enter_from_user_mode
	ret

.Lerror_entry_done:
	TRACE_IRQS_OFF
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here. B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	incl	%ebx
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)
	je	.Lerror_bad_iret
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	.Lbstep_iret
	cmpq	$.Lgs_change, RIP+8(%rsp)
	jne	.Lerror_entry_done

	/*
	 * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
	 * gsbase and proceed. We'll fix up the exception and land in
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	jmp	.Lerror_entry_from_usermode_swapgs

.Lbstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user gsbase.
	 * Switch to kernel gsbase:
	 */
	SWAPGS

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET and clear EBX so that
	 * error_exit knows that we will be returning to user mode.
	 */
	mov	%rsp, %rdi
	call	fixup_bad_iret
	mov	%rax, %rsp
	decl	%ebx
	jmp	.Lerror_entry_from_usermode_after_swapgs
END(error_entry)


/*
 * On entry, EBX is a "return to kernel mode" flag:
 *   1: already in kernel mode, don't need SWAPGS
 *   0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
 */
ENTRY(error_exit)
	movl	%ebx, %eax
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	%eax, %eax
	jnz	retint_kernel
	jmp	retint_user
END(error_exit)

/* Runs on exception stack */
ENTRY(nmi)
	/*
	 * Fix up the exception frame if we're on Xen.
	 * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
	 * one value to the stack on native, so it may clobber the rdx
	 * scratch slot, but it won't clobber any of the important
	 * slots past it.
	 *
	 * Xen is a different story, because the Xen frame itself overlaps
	 * the "NMI executing" variable.
	 */
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *   Check a special location on the stack that contains
	 *   a variable that is set when NMIs are executing.
	 *   The interrupted task's stack is also checked to see if it
	 *   is an NMI stack.
	 *   If the variable is not set and the stack is not the NMI
	 *   stack then:
	 *     o Set the special variable on the stack
	 *     o Copy the interrupt frame into an "outermost" location on the
	 *       stack
	 *     o Copy the interrupt frame into an "iret" location on the stack
	 *     o Continue processing the NMI
	 *   If the variable is set or the previous stack is the NMI stack:
	 *     o Modify the "iret" location to jump to repeat_nmi
	 *     o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction. Similarly, IRET to user mode
	 * can fault. We therefore handle NMIs from user space like
	 * other IST entries.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	testb	$3, CS-RIP+8(%rsp)
	jz	.Lnmi_from_kernel

	/*
	 * NMI from user mode. We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 *
	 * We also must not push anything to the stack before switching
	 * stacks lest we corrupt the "NMI executing" variable.
	 */

	SWAPGS_UNSAFE_STACK
	cld
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	pushq	5*8(%rdx)			/* pt_regs->ss */
	pushq	4*8(%rdx)			/* pt_regs->rsp */
	pushq	3*8(%rdx)			/* pt_regs->flags */
	pushq	2*8(%rdx)			/* pt_regs->cs */
	pushq	1*8(%rdx)			/* pt_regs->rip */
	pushq	$-1				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	(%rdx)				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	%rax				/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	pushq	%rbx				/* pt_regs->rbx */
	pushq	%rbp				/* pt_regs->rbp */
	pushq	%r12				/* pt_regs->r12 */
	pushq	%r13				/* pt_regs->r13 */
	pushq	%r14				/* pt_regs->r14 */
	pushq	%r15				/* pt_regs->r15 */

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	do_nmi

	/*
	 * Return back to user mode. We must *not* do the normal exit
	 * work, because we don't want to enable interrupts. Fortunately,
	 * do_nmi doesn't modify pt_regs.
	 */
	SWAPGS
	jmp	restore_c_regs_and_iret

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                              |
	 * | original Return RSP                                      |
	 * | original RFLAGS                                          |
	 * | original CS                                              |
	 * | original RIP                                             |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                     |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                 |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame         |
	 * | iret Return RSP  } on each loop iteration; overwritten   |
	 * | iret RFLAGS      } by a nested NMI to force another      |
	 * | iret CS          } iteration if needed.                  |
	 * | iret RIP         }                                       |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;        |
	 * | outermost Return RSP  } will not be changed before       |
	 * | outermost RFLAGS      } NMI processing is done.          |
	 * | outermost CS          } Copied to "iret" frame on each   |
	 * | outermost RIP         } iteration.                       |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                  |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware. Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 */

	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI. We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI. That's okay; the outer NMI handler is
	 * about to call do_nmi anyway, so we can just
	 * resume the outer NMI.
	 */

	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out
1:

	/*
	 * Now check "NMI executing". If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack. This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET. We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets. We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP. We set DF before we clear
	 * "NMI executing".
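	 *
	 * (In the DF check below, 3*8(%rsp) is the saved RFLAGS image and
	 * the "+ 1" selects its second byte, so testing X86_EFLAGS_DF >> 8
	 * against that byte inspects the DF bit.)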
	 */
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi

	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi

	/* Ah, it is within the NMI stack. */

	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz	first_nmi			/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	subq	$8, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	INTERRUPT_RETURN

first_nmi:
	/* Restore rdx. */
	movq	(%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq	$0

	/* Leave room for the "iret" frame */
	subq	$(5*8), %rsp

	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away. Disabled by
	 * default because IRET is very expensive.
	 */
	pushq	$0				/* SS */
	pushq	%rsp				/* RSP (minus 8 because of the previous push) */
	addq	$8, (%rsp)			/* Fix up RSP */
	pushfq					/* RFLAGS */
	pushq	$__KERNEL_CS			/* CS */
	pushq	$1f				/* RIP */
	INTERRUPT_RETURN			/* continues at repeat_nmi below */
1:
#endif

repeat_nmi:
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 *
	 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration. paranoid_entry will load the kernel
	 * gsbase if needed before we call do_nmi. "NMI executing"
	 * is zero.
	 */
	movq	$1, 10*8(%rsp)			/* Set "NMI executing". */

	/*
	 * Copy the "outermost" frame to the "iret" frame. NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
	 */
	addq	$(10*8), %rsp
	.rept 5
	pushq	-6*8(%rsp)
	.endr
	subq	$(5*8), %rsp
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
	 */
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled.
	 * An NMI should not be setting NEED_RESCHED or anything that
	 * normal interrupts and exceptions might do.
	 */
	call	paranoid_entry

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	do_nmi

	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS

	/* Point RSP at the "iret" frame. */
	REMOVE_PT_GPREGS_FROM_STACK 6*8

	/*
	 * Clear "NMI executing". Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths. On a native kernel, we
	 * could just inspect RIP, but, on paravirt kernels,
	 * INTERRUPT_RETURN can translate into a jump into a
	 * hypercall page.
	 */
	std
	movq	$0, 5*8(%rsp)			/* clear "NMI executing" */

	/*
	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
	 * stack in a single instruction. We are returning to kernel
	 * mode, so this cannot result in a fault.
	 */
	INTERRUPT_RETURN
END(nmi)

ENTRY(ignore_sysret)
	mov	$-ENOSYS, %eax
	sysret
END(ignore_sysret)

ENTRY(rewind_stack_do_exit)
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl	%ebp, %ebp

	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
	leaq	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp

	call	do_exit
1:	jmp	1b
END(rewind_stack_do_exit)