/*
 *  linux/arch/x86_64/entry.S
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 * Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * A note on terminology:
 * - iret frame:  Architecture defined interrupt frame from SS to RIP
 *                at the top of the kernel process stack.
 *
 * Some macro usage:
 * - ENTRY/END:   Define functions in the symbol table.
 * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
 * - idtentry:    Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT	0x80000000
#define __AUDIT_ARCH_LE		0x40000000

.code64
.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */

.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
	bt $9, EFLAGS(%rsp)		/* interrupts off? */
	jnc 1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * When dynamic function tracer is enabled it will add a breakpoint
 * to all locations that it is about to modify, sync CPUs, update
 * all the code, sync CPUs, then remove the breakpoints. In this time
 * if lockdep is enabled, it might jump back into the debug handler
 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
 *
 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
 * make sure the stack pointer does not get reset back to the top
 * of the debug stack, and instead just reuses the current stack.
 */
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)

.macro TRACE_IRQS_OFF_DEBUG
	call debug_stack_set_zero
	TRACE_IRQS_OFF
	call debug_stack_reset
.endm

.macro TRACE_IRQS_ON_DEBUG
	call debug_stack_set_zero
	TRACE_IRQS_ON
	call debug_stack_reset
.endm

.macro TRACE_IRQS_IRETQ_DEBUG
	bt $9, EFLAGS(%rsp)		/* interrupts off? */
	jnc 1f
	TRACE_IRQS_ON_DEBUG
1:
.endm

#else
# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
#endif

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls. The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries. There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
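 *
 * For illustration only (nothing below depends on it), a userspace
 * write(2) issued straight at this entry point looks roughly like
 * (msg/len are hypothetical symbols):
 *
 *	movl	$1, %eax		# __NR_write
 *	movl	$1, %edi		# fd = stdout
 *	leaq	msg(%rip), %rsi		# buf
 *	movl	$len, %edx		# count
 *	syscall				# rax = result, rcx/r11 clobbered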
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When the user can change pt_regs->foo, always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

ENTRY(entry_SYSCALL_64)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(entry_SYSCALL_64_after_swapgs)

	movq %rsp, PER_CPU_VAR(rsp_scratch)
	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	TRACE_IRQS_OFF

	/* Construct struct pt_regs on stack */
	pushq $__USER_DS			/* pt_regs->ss */
	pushq PER_CPU_VAR(rsp_scratch)		/* pt_regs->sp */
	pushq %r11				/* pt_regs->flags */
	pushq $__USER_CS			/* pt_regs->cs */
	pushq %rcx				/* pt_regs->ip */
	pushq %rax				/* pt_regs->orig_ax */
	pushq %rdi				/* pt_regs->di */
	pushq %rsi				/* pt_regs->si */
	pushq %rdx				/* pt_regs->dx */
	pushq %rcx				/* pt_regs->cx */
	pushq $-ENOSYS				/* pt_regs->ax */
	pushq %r8				/* pt_regs->r8 */
	pushq %r9				/* pt_regs->r9 */
	pushq %r10				/* pt_regs->r10 */
	pushq %r11				/* pt_regs->r11 */
	sub $(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

	/*
	 * If we need to do entry work or if we guess we'll need to do
	 * exit work, go straight to the slow path.
	 */
	testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
	/*
	 * Easy case: enable interrupts and issue the syscall. If the syscall
	 * needs pt_regs, we'll call a stub that disables interrupts again
	 * and jumps to the slow path.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max, %rax
#else
	andl $__SYSCALL_MASK, %eax
	cmpl $__NR_syscall_max, %eax
#endif
	ja 1f					/* return -ENOSYS (already in pt_regs->ax) */
	movq %r10, %rcx

	/*
	 * This call instruction is handled specially in stub_ptregs_64.
	 * It might end up jumping to the slow path. If it jumps, RAX
	 * and all argument registers are clobbered.
	 */
	call *sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:

	movq %rax, RAX(%rsp)
1:

	/*
	 * If we get here, then we know that pt_regs is clean for SYSRET64.
	 * If we see that no exit work is required (which we are required
	 * to check with IRQs off), then we can go straight to SYSRET64.
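	 * (For reference, _TIF_ALLWORK_MASK -- defined in asm/thread_info.h --
	 * covers deferred exit work such as pending signals, need_resched and
	 * syscall exit tracing.)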
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz 1f

	LOCKDEP_SYS_EXIT
	TRACE_IRQS_ON				/* user mode is traced as IRQs on */
	movq RIP(%rsp), %rcx
	movq EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq RSP(%rsp), %rsp
	USERGS_SYSRET64

1:
	/*
	 * The fast path looked good when we started, but something changed
	 * along the way and we need to switch to the slow path. Calling
	 * raise(3) will trigger this, for example. IRQs are off.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	movq %rsp, %rdi
	call syscall_return_slowpath		/* returns with IRQs disabled */
	jmp return_from_SYSCALL_64

entry_SYSCALL64_slow_path:
	/* IRQs are off. */
	SAVE_EXTRA_REGS
	movq %rsp, %rdi
	call do_syscall_64			/* returns with IRQs disabled */

return_from_SYSCALL_64:
	RESTORE_EXTRA_REGS
	TRACE_IRQS_IRETQ			/* we're about to change IF */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq RCX(%rsp), %rcx
	movq RIP(%rsp), %r11
	cmpq %rcx, %r11				/* RCX == RIP */
	jne opportunistic_sysret_failed

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space. This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- SYSRET checks need update"
	.endif

	/* Change top 16 bits to be the sign-extension of 47th bit */
	shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

	/* If this changed %rcx, it was not canonical */
	cmpq %rcx, %r11
	jne opportunistic_sysret_failed

	cmpq $__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne opportunistic_sysret_failed

	movq R11(%rsp), %r11
	cmpq %r11, EFLAGS(%rsp)			/* R11 == RFLAGS */
	jne opportunistic_sysret_failed

	/*
	 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET. This would cause an infinite loop whenever #DB happens
	 * with register state that satisfies the opportunistic SYSRET
	 * conditions. For example, single-stepping this user code:
	 *
	 *           movq $stuck_here, %rcx
	 *           pushfq
	 *           popq %r11
	 * stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq $__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne opportunistic_sysret_failed

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	SWAPGS
	jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)

ENTRY(stub_ptregs_64)
	/*
	 * Syscalls marked as needing ptregs land here.
	 * If we are on the fast path, we need to save the extra regs,
	 * which we achieve by trying again on the slow path.
	 * If we are on the slow path, the extra regs are already saved.
	 *
	 * RAX stores a pointer to the C function implementing the syscall.
	 * IRQs are on.
	 */
	cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
	jne 1f

	/*
	 * Called from fast path -- disable IRQs again, pop return address
	 * and jump to slow path
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	popq %rax
	jmp entry_SYSCALL64_slow_path

1:
	/* Called from C */
	jmp *%rax
END(stub_ptregs_64)

.macro ptregs_stub func
ENTRY(ptregs_\func)
	leaq \func(%rip), %rax
	jmp stub_ptregs_64
END(ptregs_\func)
.endm

/* Instantiate ptregs_stub for each ptregs-using syscall */
#define __SYSCALL_64_QUAL_(sym)
#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
#include <asm/syscalls_64.h>

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)
	LOCK ; btr $TIF_FORK, TI_flags(%r8)

	call schedule_tail			/* rdi: 'prev' task parameter */

	testb $3, CS(%rsp)			/* from kernel_thread? */
	jnz 1f

	/*
	 * We came from kernel_thread. This code path is quite twisted, and
	 * someone should clean it up.
	 *
	 * copy_thread_tls stashes the function pointer in RBX and the
	 * parameter to be passed in RBP. The called function is permitted
	 * to call do_execve and thereby jump to user mode.
	 */
	movq RBP(%rsp), %rdi
	call *RBX(%rsp)
	movl $0, RAX(%rsp)

	/*
	 * Fall through as though we're exiting a syscall. This makes a
	 * twisted sort of sense if we just called do_execve.
	 */

1:
	movq %rsp, %rdi
	call syscall_return_slowpath		/* returns with IRQs disabled */
	TRACE_IRQS_ON				/* user mode is traced as IRQS on */
	SWAPGS
	jmp restore_regs_and_iret
END(ret_from_fork)

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 */
	.align 8
ENTRY(irq_entries_start)
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushq $(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp common_interrupt
	.align 8
    .endr
END(irq_entries_start)

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee clobbered registers in fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	cld
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS

	testb $3, CS(%rsp)
	jz 1f

	/*
	 * IRQ from user mode. Switch to kernel gsbase and inform context
	 * tracking that we're in kernel mode.
	 */
	SWAPGS

	/*
	 * We need to tell lockdep that IRQs are off. We can't do this until
	 * we fix gsbase, and we should do it before enter_from_user_mode
	 * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
	 * the simplest way to handle it is to just call it twice if
	 * we enter from user mode. There's no reason to optimize this since
	 * TRACE_IRQS_OFF is a no-op if lockdep is off.
	 */
	TRACE_IRQS_OFF

	CALL_enter_from_user_mode

1:
	/*
	 * Save previous stack pointer, optionally switch to interrupt stack.
	 * irq_count is used to check if a CPU is already on an interrupt stack
	 * or not.
	 * While this is essentially redundant with preempt_count, it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
	movq %rsp, %rdi
	incl PER_CPU_VAR(irq_count)
	cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq %rdi
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF

	call \func				/* rdi points to pt_regs */
	.endm

	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addq $-0x80, (%rsp)			/* Adjust vector to [-256, -1] range */
	interrupt do_IRQ
	/* 0(%rsp): old RSP */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq %rsp

	testb $3, CS(%rsp)
	jz retint_kernel

	/* Interrupt came from user space */
GLOBAL(retint_user)
	mov %rsp,%rdi
	call prepare_exit_to_usermode
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp restore_regs_and_iret

/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
	/* Interrupts are off */
	/* Check if we need preemption */
	bt $9, EFLAGS(%rsp)			/* were interrupts off? */
	jnc 1f
0:	cmpl $0, PER_CPU_VAR(__preempt_count)
	jnz 1f
	call preempt_schedule_irq
	jmp 0b
1:
#endif
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ

/*
 * At this label, code paths which return to kernel and to user,
 * which come from interrupts/exception and from syscalls, merge.
 */
GLOBAL(restore_regs_and_iret)
	RESTORE_EXTRA_REGS
restore_c_regs_and_iret:
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN

ENTRY(native_iret)
	/*
	 * Are we returning to a stack segment from the LDT? Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb $4, (SS-RIP)(%rsp)
	jnz native_irq_return_ldt
#endif

.global native_irq_return_iret
native_irq_return_iret:
	/*
	 * This may fault. Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in do_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	pushq %rax
	pushq %rdi
	SWAPGS
	movq PER_CPU_VAR(espfix_waddr), %rdi
	movq %rax, (0*8)(%rdi)			/* RAX */
	movq (2*8)(%rsp), %rax			/* RIP */
	movq %rax, (1*8)(%rdi)
	movq (3*8)(%rsp), %rax			/* CS */
	movq %rax, (2*8)(%rdi)
	movq (4*8)(%rsp), %rax			/* RFLAGS */
	movq %rax, (3*8)(%rdi)
	movq (6*8)(%rsp), %rax			/* SS */
	movq %rax, (5*8)(%rdi)
	movq (5*8)(%rsp), %rax			/* RSP */
	movq %rax, (4*8)(%rdi)
	andl $0xffff0000, %eax
	popq %rdi
	orq PER_CPU_VAR(espfix_stack), %rax
	SWAPGS
	movq %rax, %rsp
	popq %rax
	jmp native_irq_return_iret
#endif
END(common_interrupt)

/*
 * APIC interrupts.
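 *
 * Each stub generated below pushes ~vector, runs the common 'interrupt'
 * machinery with its smp_* C handler and then returns through
 * ret_from_intr; see the apicinterrupt3 macro.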
 */
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
	ASM_CLAC
	pushq $~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp ret_from_intr
END(\sym)
.endm

#ifdef CONFIG_TRACING
#define trace(sym) trace_##sym
#define smp_trace(sym) smp_trace_##sym

.macro trace_apicinterrupt num sym
apicinterrupt3 \num trace(\sym) smp_trace(\sym)
.endm
#else
.macro trace_apicinterrupt num sym do_sym
.endm
#endif

.macro apicinterrupt num sym do_sym
apicinterrupt3 \num \sym \do_sym
trace_apicinterrupt \num \sym
.endm

#ifdef CONFIG_SMP
apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
#endif

apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi

#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
#endif

#ifdef CONFIG_X86_MCE_AMD
apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
#endif

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
#endif

/*
 * Exception entry points.
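 *
 * A short summary of the idtentry parameters used below:
 * - has_error_code: the CPU pushes an error code for this vector.
 * - paranoid: use paranoid_entry, which reads MSR_GS_BASE to decide
 *   whether SWAPGS is needed, so it is safe even when GS may still
 *   hold the user value.
 * - shift_ist: shift the IST slot in the TSS by EXCEPTION_STKSZ around
 *   the handler call so that limited recursion works (used by #DB).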
 */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)

.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
	/* Sanity check */
	.if \shift_ist != -1 && \paranoid == 0
	.error "using shift_ist requires paranoid=1"
	.endif

	ASM_CLAC
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	.ifeq \has_error_code
	pushq $-1				/* ORIG_RAX: no syscall to restart */
	.endif

	ALLOC_PT_GPREGS_ON_STACK

	.if \paranoid
	.if \paranoid == 1
	testb $3, CS(%rsp)			/* If coming from userspace, switch stacks */
	jnz 1f
	.endif
	call paranoid_entry
	.else
	call error_entry
	.endif
	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */

	.if \paranoid
	.if \shift_ist != -1
	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
	.else
	TRACE_IRQS_OFF
	.endif
	.endif

	movq %rsp, %rdi				/* pt_regs pointer */

	.if \has_error_code
	movq ORIG_RAX(%rsp), %rsi		/* get error code */
	movq $-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl %esi, %esi				/* no error code */
	.endif

	.if \shift_ist != -1
	subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	call \do_sym

	.if \shift_ist != -1
	addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	/* these procedures expect "no swapgs" flag in ebx */
	.if \paranoid
	jmp paranoid_exit
	.else
	jmp error_exit
	.endif

	.if \paranoid == 1
	/*
	 * Paranoid entry from userspace. Switch stacks and treat it
	 * as a normal entry. This means that paranoid handlers
	 * run in real process context if user_mode(regs).
	 */
1:
	call error_entry


	movq %rsp, %rdi				/* pt_regs pointer */
	call sync_regs
	movq %rax, %rsp				/* switch stack */

	movq %rsp, %rdi				/* pt_regs pointer */

	.if \has_error_code
	movq ORIG_RAX(%rsp), %rsi		/* get error code */
	movq $-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl %esi, %esi				/* no error code */
	.endif

	call \do_sym

	jmp error_exit				/* %ebx: no swapgs flag */
	.endif
END(\sym)
.endm

#ifdef CONFIG_TRACING
.macro trace_idtentry sym do_sym has_error_code:req
idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#else
.macro trace_idtentry sym do_sym has_error_code:req
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif

idtentry divide_error do_divide_error has_error_code=0
idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
idtentry double_fault do_double_fault has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
idtentry coprocessor_error do_coprocessor_error has_error_code=0
idtentry alignment_check do_alignment_check has_error_code=1
idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0


/*
 * Reload gs selector with exception handling
 * edi: new selector
 */
ENTRY(native_load_gs_index)
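	/*
	 * pushfq/popfq preserve the caller's IF across the
	 * DISABLE_INTERRUPTS below. Interrupts have to stay off between
	 * the two SWAPGS instructions: an interrupt taken from kernel
	 * mode would not do its own swapgs and would run with the wrong
	 * GS base.
	 */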
	pushfq
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
.Lgs_change:
	movl %edi, %gs
2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
	SWAPGS
	popfq
	ret
END(native_load_gs_index)

	_ASM_EXTABLE(.Lgs_change, bad_gs)
	.section .fixup, "ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS					/* switch back to user gs */
.macro ZAP_GS
	/* This can't be a string because the preprocessor needs to see it. */
	movl $__USER_DS, %eax
	movl %eax, %gs
.endm
	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
	xorl %eax, %eax
	movl %eax, %gs
	jmp 2b
	.previous

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(do_softirq_own_stack)
	pushq %rbp
	mov %rsp, %rbp
	incl PER_CPU_VAR(irq_count)
	cmove PER_CPU_VAR(irq_stack_ptr), %rsp
	push %rbp				/* frame pointer backlink */
	call __do_softirq
	leaveq
	decl PER_CPU_VAR(irq_count)
	ret
END(do_softirq_own_stack)

#ifdef CONFIG_XEN
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct *pt_regs) */

/*
 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct *pt_regs) will
 * see the correct pointer to the pt_regs
 */
	movq %rdi, %rsp				/* we don't return, adjust the stack frame */
11:	incl PER_CPU_VAR(irq_count)
	movq %rsp, %rbp
	cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq %rbp				/* frame pointer backlink */
	call xen_evtchn_do_upcall
	popq %rsp
	decl PER_CPU_VAR(irq_count)
#ifndef CONFIG_PREEMPT
	call xen_maybe_preempt_hcall
#endif
	jmp error_exit
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	movl %ds, %ecx
	cmpw %cx, 0x10(%rsp)
	jne 1f
	movl %es, %ecx
	cmpw %cx, 0x18(%rsp)
	jne 1f
	movl %fs, %ecx
	cmpw %cx, 0x20(%rsp)
	jne 1f
	movl %gs, %ecx
	cmpw %cx, 0x28(%rsp)
	jne 1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
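	/*
	 * Synthesize a frame with a zero RIP and hand the failure to the
	 * general_protection entry; as described above, this path ends up
	 * killing the current process.
	 */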
	movq (%rsp), %rcx
	movq 8(%rsp), %r11
	addq $0x30, %rsp
	pushq $0				/* RIP */
	pushq %r11
	pushq %rcx
	jmp general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq (%rsp), %rcx
	movq 8(%rsp), %r11
	addq $0x30, %rsp
	pushq $-1				/* orig_ax = -1 => not a system call */
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	jmp error_exit
END(xen_failsafe_callback)

apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */

idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment do_stack_segment has_error_code=1

#ifdef CONFIG_XEN
idtentry xen_debug do_debug has_error_code=0
idtentry xen_int3 do_int3 has_error_code=0
idtentry xen_stack_segment do_stack_segment has_error_code=1
#endif

idtentry general_protection do_general_protection has_error_code=1
trace_idtentry page_fault do_page_fault has_error_code=1

#ifdef CONFIG_KVM_GUEST
idtentry async_page_fault do_async_page_fault has_error_code=1
#endif

#ifdef CONFIG_X86_MCE
idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Use slow, but surefire "are we in kernel?" check.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
ENTRY(paranoid_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	movl $1, %ebx
	movl $MSR_GS_BASE, %ecx
	rdmsr
	testl %edx, %edx
	js 1f					/* negative -> in kernel */
	SWAPGS
	xorl %ebx, %ebx
1:	ret
END(paranoid_entry)

/*
 * "Paranoid" exit path from exception stack. This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated. Fortunately, there's no good reason
 * to try to handle preemption here.
 *
 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
 */
ENTRY(paranoid_exit)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF_DEBUG
	testl %ebx, %ebx			/* swapgs needed? */
	jnz paranoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	SWAPGS_UNSAFE_STACK
	jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
	TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN
END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Return: EBX=0: came from user mode; EBX=1: otherwise
 */
ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	xorl %ebx, %ebx
	testb $3, CS+8(%rsp)
	jz .Lerror_kernelspace

.Lerror_entry_from_usermode_swapgs:
	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	SWAPGS

.Lerror_entry_from_usermode_after_swapgs:
	/*
	 * We need to tell lockdep that IRQs are off.
	 * We can't do this until we fix gsbase, and we should do it before
	 * enter_from_user_mode (which can take locks).
	 */
	TRACE_IRQS_OFF
	CALL_enter_from_user_mode
	ret

.Lerror_entry_done:
	TRACE_IRQS_OFF
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here. B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	incl %ebx
	leaq native_irq_return_iret(%rip), %rcx
	cmpq %rcx, RIP+8(%rsp)
	je .Lerror_bad_iret
	movl %ecx, %eax				/* zero extend */
	cmpq %rax, RIP+8(%rsp)
	je .Lbstep_iret
	cmpq $.Lgs_change, RIP+8(%rsp)
	jne .Lerror_entry_done

	/*
	 * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
	 * gsbase and proceed. We'll fix up the exception and land in
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	jmp .Lerror_entry_from_usermode_swapgs

.Lbstep_iret:
	/* Fix truncated RIP */
	movq %rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user gsbase.
	 * Switch to kernel gsbase:
	 */
	SWAPGS

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET and clear EBX so that
	 * error_exit knows that we will be returning to user mode.
	 */
	mov %rsp, %rdi
	call fixup_bad_iret
	mov %rax, %rsp
	decl %ebx
	jmp .Lerror_entry_from_usermode_after_swapgs
END(error_entry)


/*
 * On entry, EBX is a "return to kernel mode" flag:
 *   1: already in kernel mode, don't need SWAPGS
 *   0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
 */
ENTRY(error_exit)
	movl %ebx, %eax
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl %eax, %eax
	jnz retint_kernel
	jmp retint_user
END(error_exit)

/* Runs on exception stack */
ENTRY(nmi)
	/*
	 * Fix up the exception frame if we're on Xen.
	 * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
	 * one value to the stack on native, so it may clobber the rdx
	 * scratch slot, but it won't clobber any of the important
	 * slots past it.
	 *
	 * Xen is a different story, because the Xen frame itself overlaps
	 * the "NMI executing" variable.
	 */
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into an "outermost" location on the
	 *      stack
	 *    o Copy the interrupt frame into an "iret" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "iret" location to jump to the repeat_nmi
	 *    o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction. Similarly, IRET to user mode
	 * can fault. We therefore handle NMIs from user space like
	 * other IST entries.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq %rdx

	testb $3, CS-RIP+8(%rsp)
	jz .Lnmi_from_kernel

	/*
	 * NMI from user mode. We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 *
	 * We also must not push anything to the stack before switching
	 * stacks lest we corrupt the "NMI executing" variable.
	 */

	SWAPGS_UNSAFE_STACK
	cld
	movq %rsp, %rdx
	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	pushq 5*8(%rdx)				/* pt_regs->ss */
	pushq 4*8(%rdx)				/* pt_regs->rsp */
	pushq 3*8(%rdx)				/* pt_regs->flags */
	pushq 2*8(%rdx)				/* pt_regs->cs */
	pushq 1*8(%rdx)				/* pt_regs->rip */
	pushq $-1				/* pt_regs->orig_ax */
	pushq %rdi				/* pt_regs->di */
	pushq %rsi				/* pt_regs->si */
	pushq (%rdx)				/* pt_regs->dx */
	pushq %rcx				/* pt_regs->cx */
	pushq %rax				/* pt_regs->ax */
	pushq %r8				/* pt_regs->r8 */
	pushq %r9				/* pt_regs->r9 */
	pushq %r10				/* pt_regs->r10 */
	pushq %r11				/* pt_regs->r11 */
	pushq %rbx				/* pt_regs->rbx */
	pushq %rbp				/* pt_regs->rbp */
	pushq %r12				/* pt_regs->r12 */
	pushq %r13				/* pt_regs->r13 */
	pushq %r14				/* pt_regs->r14 */
	pushq %r15				/* pt_regs->r15 */

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq %rsp, %rdi
	movq $-1, %rsi
	call do_nmi

	/*
	 * Return back to user mode. We must *not* do the normal exit
	 * work, because we don't want to enable interrupts. Fortunately,
	 * do_nmi doesn't modify pt_regs.
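	 *
	 * (We can take the shorter restore_c_regs_and_iret path below:
	 * do_nmi preserves the callee-saved registers per the C ABI and
	 * does not touch their pt_regs copies, so only the C-clobbered
	 * registers need to be reloaded from the stack before the IRET.)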
	 */
	SWAPGS
	jmp restore_c_regs_and_iret

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                              |
	 * | original Return RSP                                      |
	 * | original RFLAGS                                          |
	 * | original CS                                              |
	 * | original RIP                                             |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                     |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                 |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame         |
	 * | iret Return RSP  } on each loop iteration; overwritten   |
	 * | iret RFLAGS      } by a nested NMI to force another      |
	 * | iret CS          } iteration if needed.                  |
	 * | iret RIP         }                                       |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;        |
	 * | outermost Return RSP  } will not be changed before       |
	 * | outermost RFLAGS      } NMI processing is done.          |
	 * | outermost CS          } Copied to "iret" frame on each   |
	 * | outermost RIP         } iteration.                       |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                  |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware. Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 */

	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI. We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI. That's okay; the outer NMI handler is
	 * about to call do_nmi anyway, so we can just
	 * resume the outer NMI.
	 */

	movq $repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	ja 1f
	movq $end_repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	ja nested_nmi_out
1:

	/*
	 * Now check "NMI executing". If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl $1, -8(%rsp)
	je nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack. This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET. We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets. We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP. We set DF before we clear
	 * "NMI executing".
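	 *
	 * Concretely, 6*8(%rsp) below is the top of this CPU's NMI stack
	 * (the five-word hardware frame plus the saved rdx slot sit just
	 * underneath it), and the interrupted RSP at 4*8(%rsp) is tested
	 * against the range [top - EXCEPTION_STKSZ, top).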
	 */
	lea 6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq %rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja first_nmi

	subq $EXCEPTION_STKSZ, %rdx
	cmpq %rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb first_nmi

	/* Ah, it is within the NMI stack. */

	testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz first_nmi				/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	subq $8, %rsp
	leaq -10*8(%rsp), %rdx
	pushq $__KERNEL_DS
	pushq %rdx
	pushfq
	pushq $__KERNEL_CS
	pushq $repeat_nmi

	/* Put stack back */
	addq $(6*8), %rsp

nested_nmi_out:
	popq %rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	INTERRUPT_RETURN

first_nmi:
	/* Restore rdx. */
	movq (%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq $0

	/* Leave room for the "iret" frame */
	subq $(5*8), %rsp

	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq 11*8(%rsp)
	.endr

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away. Disabled by
	 * default because IRET is very expensive.
	 */
	pushq $0				/* SS */
	pushq %rsp				/* RSP (minus 8 because of the previous push) */
	addq $8, (%rsp)				/* Fix up RSP */
	pushfq					/* RFLAGS */
	pushq $__KERNEL_CS			/* CS */
	pushq $1f				/* RIP */
	INTERRUPT_RETURN			/* continues at repeat_nmi below */
1:
#endif

repeat_nmi:
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 *
	 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration. paranoid_entry will load the kernel
	 * gsbase if needed before we call do_nmi. "NMI executing"
	 * is zero.
	 */
	movq $1, 10*8(%rsp)			/* Set "NMI executing". */

	/*
	 * Copy the "outermost" frame to the "iret" frame. NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
	 */
	addq $(10*8), %rsp
	.rept 5
	pushq -6*8(%rsp)
	.endr
	subq $(5*8), %rsp
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
	 */
	pushq $-1				/* ORIG_RAX: no syscall to restart */
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled, an NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call paranoid_entry

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq %rsp, %rdi
	movq $-1, %rsi
	call do_nmi

	testl %ebx, %ebx			/* swapgs needed? */
	jnz nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS

	/* Point RSP at the "iret" frame. */
	REMOVE_PT_GPREGS_FROM_STACK 6*8

	/*
	 * Clear "NMI executing". Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths. On a native kernel, we
	 * could just inspect RIP, but, on paravirt kernels,
	 * INTERRUPT_RETURN can translate into a jump into a
	 * hypercall page.
	 */
	std
	movq $0, 5*8(%rsp)			/* clear "NMI executing" */

	/*
	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
	 * stack in a single instruction. We are returning to kernel
	 * mode, so this cannot result in a fault.
	 */
	INTERRUPT_RETURN
END(nmi)

ENTRY(ignore_sysret)
	mov $-ENOSYS, %eax
	sysret
END(ignore_sysret)