/*
 * linux/arch/x86_64/entry.S
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 * Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - ENTRY/END:		Define functions in the symbol table.
 * - TRACE_IRQ_*:	Trace hardirq state for lock debugging.
 * - idtentry:		Define exception entry points.
 */
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT	0x80000000
#define __AUDIT_ARCH_LE		0x40000000

.code64
.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */

.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * When the dynamic function tracer is enabled it will add a breakpoint
 * to all locations that it is about to modify, sync CPUs, update
 * all the code, sync CPUs, then remove the breakpoints. During this window,
 * if lockdep is enabled, it might jump back into the debug handler
 * outside the updating of the IST protection (TRACE_IRQS_ON/OFF).
 *
 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
 * make sure the stack pointer does not get reset back to the top
 * of the debug stack, and instead just reuses the current stack.
 */
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)

.macro TRACE_IRQS_OFF_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_OFF
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_ON_DEBUG
	call	debug_stack_set_zero
	TRACE_IRQS_ON
	call	debug_stack_reset
.endm

.macro TRACE_IRQS_IRETQ_DEBUG
	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
	jnc	1f
	TRACE_IRQS_ON_DEBUG
1:
.endm

#else
# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
#endif

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls. The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries. There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is a callee-clobbered register in the C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to the C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in the C ABI)
 *
 * Only called from user space.
 *
 * When the user can change pt_regs->foo, always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */
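
/*
 * Illustrative only (not part of the original file): the user-space side
 * of the register contract documented above, assuming the standard x86-64
 * syscall numbering (__NR_write == 1). The SYSCALL instruction itself
 * stashes RIP in RCX and RFLAGS in R11 before control reaches
 * entry_SYSCALL_64 below:
 *
 *	movl	$1, %eax		# rax: system call number (__NR_write)
 *	movl	$1, %edi		# rdi: arg0 (fd = stdout)
 *	leaq	msg(%rip), %rsi		# rsi: arg1 (buffer; hypothetical label)
 *	movl	$14, %edx		# rdx: arg2 (byte count)
 *	syscall				# rax: return value or -errno
 */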

ENTRY(entry_SYSCALL_64)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(entry_SYSCALL_64_after_swapgs)

	movq	%rsp, PER_CPU_VAR(rsp_scratch)
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	TRACE_IRQS_OFF

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS			/* pt_regs->ss */
	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
	pushq	%r11				/* pt_regs->flags */
	pushq	$__USER_CS			/* pt_regs->cs */
	pushq	%rcx				/* pt_regs->ip */
	pushq	%rax				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	%rdx				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	$-ENOSYS			/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */

	/*
	 * If we need to do entry work or if we guess we'll need to do
	 * exit work, go straight to the slow path.
	 */
	testl	$_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
	/*
	 * Easy case: enable interrupts and issue the syscall. If the syscall
	 * needs pt_regs, we'll call a stub that disables interrupts again
	 * and jumps to the slow path.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#if __SYSCALL_MASK == ~0
	cmpq	$__NR_syscall_max, %rax
#else
	andl	$__SYSCALL_MASK, %eax
	cmpl	$__NR_syscall_max, %eax
#endif
	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
	movq	%r10, %rcx

	/*
	 * This call instruction is handled specially in stub_ptregs_64.
	 * It might end up jumping to the slow path. If it jumps, RAX
	 * and all argument registers are clobbered.
	 */
	call	*sys_call_table(, %rax, 8)
.Lentry_SYSCALL_64_after_fastpath_call:

	movq	%rax, RAX(%rsp)
1:

	/*
	 * If we get here, then we know that pt_regs is clean for SYSRET64.
	 * If we see that no exit work is required (which we are required
	 * to check with IRQs off), then we can go straight to SYSRET64.
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	1f

	LOCKDEP_SYS_EXIT
	TRACE_IRQS_ON				/* user mode is traced as IRQs on */
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

1:
	/*
	 * The fast path looked good when we started, but something changed
	 * along the way and we need to switch to the slow path. Calling
	 * raise(3) will trigger this, for example. IRQs are off.
	 */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	call	syscall_return_slowpath		/* returns with IRQs disabled */
	jmp	return_from_SYSCALL_64

entry_SYSCALL64_slow_path:
	/* IRQs are off. */
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi
	call	do_syscall_64			/* returns with IRQs disabled */

return_from_SYSCALL_64:
	RESTORE_EXTRA_REGS
	TRACE_IRQS_IRETQ			/* we're about to change IF */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.
	 */
	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11
	cmpq	%rcx, %r11			/* RCX == RIP */
	jne	opportunistic_sysret_failed

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space. This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If the width of the "canonical tail" ever becomes variable, this
	 * will need to be updated to remain correct on both old and new CPUs.
	 */
	.ifne __VIRTUAL_MASK_SHIFT - 47
	.error "virtual address width changed -- SYSRET checks need update"
	.endif

	/* Change the top 16 bits to be the sign-extension of the 47th bit */
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	opportunistic_sysret_failed
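
/*
 * Illustrative only: with __VIRTUAL_MASK_SHIFT == 47 the shift count above
 * is 16. A canonical return address such as 0x00007fffffffe000 passes
 * through the shl/sar pair unchanged, while a non-canonical value such as
 * 0x0000800000000000 comes back as 0xffff800000000000, so the cmpq above
 * sends it to opportunistic_sysret_failed.
 */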

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	opportunistic_sysret_failed

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	opportunistic_sysret_failed

	/*
	 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET. This would cause an infinite loop whenever #DB happens
	 * with register state that satisfies the opportunistic SYSRET
	 * conditions. For example, single-stepping this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq	%r11
	 * stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	opportunistic_sysret_failed

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	opportunistic_sysret_failed

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64

opportunistic_sysret_failed:
	SWAPGS
	jmp	restore_c_regs_and_iret
END(entry_SYSCALL_64)

ENTRY(stub_ptregs_64)
	/*
	 * Syscalls marked as needing ptregs land here.
	 * If we are on the fast path, we need to save the extra regs,
	 * which we achieve by trying again on the slow path. If we are on
	 * the slow path, the extra regs are already saved.
	 *
	 * RAX stores a pointer to the C function implementing the syscall.
	 * IRQs are on.
	 */
	cmpq	$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
	jne	1f

	/*
	 * Called from fast path -- disable IRQs again, pop return address
	 * and jump to slow path
	 */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	popq	%rax
	jmp	entry_SYSCALL64_slow_path

1:
	jmp	*%rax				/* called from C */
END(stub_ptregs_64)

.macro ptregs_stub func
ENTRY(ptregs_\func)
	leaq	\func(%rip), %rax
	jmp	stub_ptregs_64
END(ptregs_\func)
.endm

/* Instantiate ptregs_stub for each ptregs-using syscall */
#define __SYSCALL_64_QUAL_(sym)
#define __SYSCALL_64_QUAL_ptregs(sym)	ptregs_stub sym
#define __SYSCALL_64(nr, sym, qual)	__SYSCALL_64_QUAL_##qual(sym)
#include <asm/syscalls_64.h>
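
/*
 * Illustrative only: for a hypothetical table entry emitted as
 * __SYSCALL_64(nr, sys_foo, ptregs), the include above expands to
 * "ptregs_stub sys_foo", which in turn assembles to roughly:
 *
 *	ENTRY(ptregs_sys_foo)
 *		leaq	sys_foo(%rip), %rax
 *		jmp	stub_ptregs_64
 *	END(ptregs_sys_foo)
 *
 * The C-side syscall table is generated so that the same slot points at
 * ptregs_sys_foo, which is how the fast path lands in stub_ptregs_64 above.
 */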

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)
	LOCK ; btr $TIF_FORK, TI_flags(%r8)

	pushq	$0x0002
	popfq					/* reset kernel eflags */

	call	schedule_tail			/* rdi: 'prev' task parameter */

	testb	$3, CS(%rsp)			/* from kernel_thread? */
	jnz	1f

	/*
	 * We came from kernel_thread. This code path is quite twisted, and
	 * someone should clean it up.
	 *
	 * copy_thread_tls stashes the function pointer in RBX and the
	 * parameter to be passed in RBP. The called function is permitted
	 * to call do_execve and thereby jump to user mode.
	 */
	movq	RBP(%rsp), %rdi
	call	*RBX(%rsp)
	movl	$0, RAX(%rsp)

	/*
	 * Fall through as though we're exiting a syscall. This makes a
	 * twisted sort of sense if we just called do_execve.
	 */

1:
	movq	%rsp, %rdi
	call	syscall_return_slowpath		/* returns with IRQs disabled */
	TRACE_IRQS_ON				/* user mode is traced as IRQS on */
	SWAPGS
	jmp	restore_regs_and_iret
END(ret_from_fork)

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 */
	.align 8
ENTRY(irq_entries_start)
	vector=FIRST_EXTERNAL_VECTOR
	.rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushq	$(~vector+0x80)			/* Note: always in signed byte range */
	vector=vector+1
	jmp	common_interrupt
	.align	8
	.endr
END(irq_entries_start)
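
/*
 * Illustrative only: a worked example of the vector encoding above,
 * assuming FIRST_EXTERNAL_VECTOR == 0x20. The stub for vector 0x20 does
 *
 *	pushq	$(~0x20 + 0x80)		# ~0x20 = -0x21, so this pushes 0x5f,
 *					# which fits in a signed byte
 *
 * and common_interrupt below undoes the bias with
 *
 *	addq	$-0x80, (%rsp)		# 0x5f - 0x80 = -0x21 = ~0x20
 *
 * leaving ~vector in orig_ax, always in the range [-256, -1]; do_IRQ then
 * recovers the vector number by complementing it again.
 */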

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee clobbered registers in fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	cld
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS

	testb	$3, CS(%rsp)
	jz	1f

	/*
	 * IRQ from user mode. Switch to kernel gsbase and inform context
	 * tracking that we're in kernel mode.
	 */
	SWAPGS

	/*
	 * We need to tell lockdep that IRQs are off. We can't do this until
	 * we fix gsbase, and we should do it before enter_from_user_mode
	 * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
	 * the simplest way to handle it is to just call it twice if
	 * we enter from user mode. There's no reason to optimize this since
	 * TRACE_IRQS_OFF is a no-op if lockdep is off.
	 */
	TRACE_IRQS_OFF

	CALL_enter_from_user_mode

1:
	/*
	 * Save the previous stack pointer, optionally switch to the interrupt
	 * stack. irq_count is used to check if a CPU is already on an
	 * interrupt stack or not. While this is essentially redundant with
	 * preempt_count it is a little cheaper to use a separate counter in
	 * the PDA (short of moving irq_enter into assembly, which would be
	 * too much work).
	 */
	movq	%rsp, %rdi
	incl	PER_CPU_VAR(irq_count)
	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq	%rdi
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF

	call	\func				/* rdi points to pt_regs */
	.endm

/*
 * The interrupt stubs push (~vector+0x80) onto the stack and
 * then jump to common_interrupt.
 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addq	$-0x80, (%rsp)			/* Adjust vector to [-256, -1] range */
	interrupt do_IRQ
	/* 0(%rsp): old RSP */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl	PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq	%rsp

	testb	$3, CS(%rsp)
	jz	retint_kernel

	/* Interrupt came from user space */
GLOBAL(retint_user)
	mov	%rsp, %rdi
	call	prepare_exit_to_usermode
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp	restore_regs_and_iret

/* Returning to kernel space */
retint_kernel:
#ifdef CONFIG_PREEMPT
	/* Interrupts are off */
	/* Check if we need preemption */
	bt	$9, EFLAGS(%rsp)		/* were interrupts off? */
	jnc	1f
0:	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	1f
	call	preempt_schedule_irq
	jmp	0b
1:
#endif
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ

/*
 * At this label, code paths which return to kernel and to user,
 * which come from interrupts/exceptions and from syscalls, merge.
 */
GLOBAL(restore_regs_and_iret)
	RESTORE_EXTRA_REGS
restore_c_regs_and_iret:
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN

ENTRY(native_iret)
	/*
	 * Are we returning to a stack segment from the LDT? Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

.global native_irq_return_iret
native_irq_return_iret:
	/*
	 * This may fault. Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in do_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	pushq	%rax
	pushq	%rdi
	SWAPGS
	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* RAX */
	movq	(2*8)(%rsp), %rax		/* RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* CS */
	movq	%rax, (2*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(6*8)(%rsp), %rax		/* SS */
	movq	%rax, (5*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* RSP */
	movq	%rax, (4*8)(%rdi)
	andl	$0xffff0000, %eax
	popq	%rdi
	orq	PER_CPU_VAR(espfix_stack), %rax
	SWAPGS
	movq	%rax, %rsp
	popq	%rax
	jmp	native_irq_return_iret
#endif
END(common_interrupt)
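
/*
 * Illustrative only: the "$4" tested in native_iret above is the Table
 * Indicator bit of the saved SS selector (selector = index << 3 | TI << 2 | RPL).
 * A GDT selector such as __USER_DS has TI = 0 and IRETs normally; an LDT
 * selector, e.g. a hypothetical 0x000f (index 1, TI = 1, RPL = 3), has
 * bit 2 set and takes the espfix64 path instead.
 */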

/*
 * APIC interrupts.
 */
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
	ASM_CLAC
	pushq	$~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp	ret_from_intr
END(\sym)
.endm

#ifdef CONFIG_TRACING
#define trace(sym) trace_##sym
#define smp_trace(sym) smp_trace_##sym

.macro trace_apicinterrupt num sym
apicinterrupt3 \num trace(\sym) smp_trace(\sym)
.endm
#else
.macro trace_apicinterrupt num sym do_sym
.endm
#endif

.macro apicinterrupt num sym do_sym
apicinterrupt3 \num \sym \do_sym
trace_apicinterrupt \num \sym
.endm

#ifdef CONFIG_SMP
apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR		irq_move_cleanup_interrupt	smp_irq_move_cleanup_interrupt
apicinterrupt3 REBOOT_VECTOR			reboot_interrupt		smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt3 UV_BAU_MESSAGE			uv_bau_message_intr1		uv_bau_message_interrupt
#endif

apicinterrupt LOCAL_TIMER_VECTOR		apic_timer_interrupt		smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR		x86_platform_ipi		smp_x86_platform_ipi

#ifdef CONFIG_HAVE_KVM
apicinterrupt3 POSTED_INTR_VECTOR		kvm_posted_intr_ipi		smp_kvm_posted_intr_ipi
apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR	kvm_posted_intr_wakeup_ipi	smp_kvm_posted_intr_wakeup_ipi
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
apicinterrupt THRESHOLD_APIC_VECTOR		threshold_interrupt		smp_threshold_interrupt
#endif

#ifdef CONFIG_X86_MCE_AMD
apicinterrupt DEFERRED_ERROR_VECTOR		deferred_error_interrupt	smp_deferred_error_interrupt
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
apicinterrupt THERMAL_APIC_VECTOR		thermal_interrupt		smp_thermal_interrupt
#endif

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR	call_function_single_interrupt	smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR		call_function_interrupt		smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR			reschedule_interrupt		smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR			error_interrupt			smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR		spurious_interrupt		smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
#endif
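
/*
 * Illustrative only: with CONFIG_TRACING disabled, an instantiation such as
 *
 *	apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
 *
 * expands through apicinterrupt3 to roughly:
 *
 *	ENTRY(apic_timer_interrupt)
 *		ASM_CLAC
 *		pushq	$~(LOCAL_TIMER_VECTOR)
 *		interrupt smp_apic_timer_interrupt
 *		jmp	ret_from_intr
 *	END(apic_timer_interrupt)
 */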

/*
 * Exception entry points.
 */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)

.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
	/* Sanity check */
	.if \shift_ist != -1 && \paranoid == 0
	.error "using shift_ist requires paranoid=1"
	.endif

	ASM_CLAC
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	.ifeq \has_error_code
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
	.endif

	ALLOC_PT_GPREGS_ON_STACK

	.if \paranoid
	.if \paranoid == 1
	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
	jnz	1f
	.endif
	call	paranoid_entry
	.else
	call	error_entry
	.endif
	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */

	.if \paranoid
	.if \shift_ist != -1
	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
	.else
	TRACE_IRQS_OFF
	.endif
	.endif

	movq	%rsp, %rdi			/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl	%esi, %esi			/* no error code */
	.endif

	.if \shift_ist != -1
	subq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	call	\do_sym

	.if \shift_ist != -1
	addq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
	.endif

	/* these procedures expect "no swapgs" flag in ebx */
	.if \paranoid
	jmp	paranoid_exit
	.else
	jmp	error_exit
	.endif

	.if \paranoid == 1
	/*
	 * Paranoid entry from userspace. Switch stacks and treat it
	 * as a normal entry. This means that paranoid handlers
	 * run in real process context if user_mode(regs).
	 */
1:
	call	error_entry

	movq	%rsp, %rdi			/* pt_regs pointer */
	call	sync_regs
	movq	%rax, %rsp			/* switch stack */

	movq	%rsp, %rdi			/* pt_regs pointer */

	.if \has_error_code
	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
	.else
	xorl	%esi, %esi			/* no error code */
	.endif

	call	\do_sym

	jmp	error_exit			/* %ebx: no swapgs flag */
	.endif
END(\sym)
.endm

#ifdef CONFIG_TRACING
.macro trace_idtentry sym do_sym has_error_code:req
idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#else
.macro trace_idtentry sym do_sym has_error_code:req
idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif

idtentry divide_error			do_divide_error			has_error_code=0
idtentry overflow			do_overflow			has_error_code=0
idtentry bounds				do_bounds			has_error_code=0
idtentry invalid_op			do_invalid_op			has_error_code=0
idtentry device_not_available		do_device_not_available		has_error_code=0
idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
idtentry segment_not_present		do_segment_not_present		has_error_code=1
idtentry spurious_interrupt_bug		do_spurious_interrupt_bug	has_error_code=0
idtentry coprocessor_error		do_coprocessor_error		has_error_code=0
idtentry alignment_check		do_alignment_check		has_error_code=1
idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0
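
/*
 * Illustrative only: a sketch of what the simplest form of the macro,
 * e.g. "idtentry overflow do_overflow has_error_code=0" (paranoid=0,
 * shift_ist=-1), expands to:
 *
 *	ENTRY(overflow)
 *		ASM_CLAC
 *		PARAVIRT_ADJUST_EXCEPTION_FRAME
 *		pushq	$-1			# ORIG_RAX: no syscall to restart
 *		ALLOC_PT_GPREGS_ON_STACK
 *		call	error_entry
 *		movq	%rsp, %rdi		# pt_regs pointer
 *		xorl	%esi, %esi		# no error code
 *		call	do_overflow
 *		jmp	error_exit
 *	END(overflow)
 */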

/*
 * Reload gs selector with exception handling
 * edi: new selector
 */
ENTRY(native_load_gs_index)
	pushfq
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
gs_change:
	movl	%edi, %gs
2:	mfence					/* workaround */
	SWAPGS
	popfq
	ret
END(native_load_gs_index)

	_ASM_EXTABLE(gs_change, bad_gs)
	.section .fixup, "ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS					/* switch back to user gs */
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b
	.previous

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(do_softirq_own_stack)
	pushq	%rbp
	mov	%rsp, %rbp
	incl	PER_CPU_VAR(irq_count)
	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
	push	%rbp				/* frame pointer backlink */
	call	__do_softirq
	leaveq
	decl	PER_CPU_VAR(irq_count)
	ret
END(do_softirq_own_stack)

#ifdef CONFIG_XEN
idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct pt_regs *) */

/*
 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
 * see the correct pointer to the pt_regs
 */
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
11:	incl	PER_CPU_VAR(irq_count)
	movq	%rsp, %rbp
	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
	pushq	%rbp				/* frame pointer backlink */
	call	xen_evtchn_do_upcall
	popq	%rsp
	decl	PER_CPU_VAR(irq_count)
#ifndef CONFIG_PREEMPT
	call	xen_maybe_preempt_hcall
#endif
	jmp	error_exit
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	pushq	%r11
	pushq	%rcx
	jmp	general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$-1				/* orig_ax = -1 => not a system call */
	ALLOC_PT_GPREGS_ON_STACK
	SAVE_C_REGS
	SAVE_EXTRA_REGS
	jmp	error_exit
END(xen_failsafe_callback)

apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

#if IS_ENABLED(CONFIG_HYPERV)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
	hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */

idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment		do_stack_segment	has_error_code=1

#ifdef CONFIG_XEN
idtentry xen_debug		do_debug		has_error_code=0
idtentry xen_int3		do_int3			has_error_code=0
idtentry xen_stack_segment	do_stack_segment	has_error_code=1
#endif

idtentry general_protection	do_general_protection	has_error_code=1
trace_idtentry page_fault	do_page_fault		has_error_code=1

#ifdef CONFIG_KVM_GUEST
idtentry async_page_fault	do_async_page_fault	has_error_code=1
#endif

#ifdef CONFIG_X86_MCE
idtentry machine_check		has_error_code=0	paranoid=1 do_sym=*machine_check_vector(%rip)
#endif

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Use a slow, but surefire "are we in kernel?" check.
 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
 */
ENTRY(paranoid_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	movl	$1, %ebx
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
1:	ret
END(paranoid_entry)
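
/*
 * Illustrative only: RDMSR of MSR_GS_BASE returns the low half of the base
 * in EAX and the high half in EDX. A kernel GS base is a kernel-space
 * address (for example, a hypothetical 0xffff88007fc00000 gives
 * EDX = 0xffff8800), so its sign bit is set and the "js" above is taken;
 * a user GS base lies in the lower canonical half, EDX is non-negative,
 * and we SWAPGS.
 */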

/*
 * "Paranoid" exit path from exception stack. This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated. Fortunately, there's no good reason to try
 * to handle preemption here.
 *
 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
 */
ENTRY(paranoid_exit)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF_DEBUG
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	paranoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	SWAPGS_UNSAFE_STACK
	jmp	paranoid_exit_restore
paranoid_exit_no_swapgs:
	TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
	INTERRUPT_RETURN
END(paranoid_exit)

/*
 * Save all registers in pt_regs, and switch gs if needed.
 * Return: EBX=0: came from user mode; EBX=1: otherwise
 */
ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
	xorl	%ebx, %ebx
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

.Lerror_entry_from_usermode_swapgs:
	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	SWAPGS

.Lerror_entry_from_usermode_after_swapgs:
	/*
	 * We need to tell lockdep that IRQs are off. We can't do this until
	 * we fix gsbase, and we should do it before enter_from_user_mode
	 * (which can take locks).
	 */
	TRACE_IRQS_OFF
	CALL_enter_from_user_mode
	ret

.Lerror_entry_done:
	TRACE_IRQS_OFF
	ret

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here. B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	incl	%ebx
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)
	je	.Lerror_bad_iret
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	.Lbstep_iret
	cmpq	$gs_change, RIP+8(%rsp)
	jne	.Lerror_entry_done

	/*
	 * hack: gs_change can fail with user gsbase. If this happens, fix up
	 * gsbase and proceed. We'll fix up the exception and land in
	 * gs_change's error handler with kernel gsbase.
	 */
	jmp	.Lerror_entry_from_usermode_swapgs

.Lbstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user gsbase.
	 * Switch to kernel gsbase:
	 */
	SWAPGS

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET and clear EBX so that
	 * error_exit knows that we will be returning to user mode.
	 */
	mov	%rsp, %rdi
	call	fixup_bad_iret
	mov	%rax, %rsp
	decl	%ebx
	jmp	.Lerror_entry_from_usermode_after_swapgs
END(error_entry)

/*
 * On entry, EBX is a "return to kernel mode" flag:
 *   1: already in kernel mode, don't need SWAPGS
 *   0: user gsbase is loaded, we need SWAPGS and standard preparation
 *      for return to usermode
 */
ENTRY(error_exit)
	movl	%ebx, %eax
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	%eax, %eax
	jnz	retint_kernel
	jmp	retint_user
END(error_exit)

/* Runs on exception stack */
ENTRY(nmi)
	/*
	 * Fix up the exception frame if we're on Xen.
	 * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
	 * one value to the stack on native, so it may clobber the rdx
	 * scratch slot, but it won't clobber any of the important
	 * slots past it.
	 *
	 * Xen is a different story, because the Xen frame itself overlaps
	 * the "NMI executing" variable.
	 */
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into an "outermost" location on the
	 *      stack
	 *    o Copy the interrupt frame into an "iret" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "iret" location to jump to the repeat_nmi
	 *    o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction. Similarly, IRET to user mode
	 * can fault. We therefore handle NMIs from user space like
	 * other IST entries.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	testb	$3, CS-RIP+8(%rsp)
	jz	.Lnmi_from_kernel

	/*
	 * NMI from user mode. We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 *
	 * We also must not push anything to the stack before switching
	 * stacks lest we corrupt the "NMI executing" variable.
	 */

	SWAPGS_UNSAFE_STACK
	cld
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	pushq	5*8(%rdx)			/* pt_regs->ss */
	pushq	4*8(%rdx)			/* pt_regs->rsp */
	pushq	3*8(%rdx)			/* pt_regs->flags */
	pushq	2*8(%rdx)			/* pt_regs->cs */
	pushq	1*8(%rdx)			/* pt_regs->rip */
	pushq	$-1				/* pt_regs->orig_ax */
	pushq	%rdi				/* pt_regs->di */
	pushq	%rsi				/* pt_regs->si */
	pushq	(%rdx)				/* pt_regs->dx */
	pushq	%rcx				/* pt_regs->cx */
	pushq	%rax				/* pt_regs->ax */
	pushq	%r8				/* pt_regs->r8 */
	pushq	%r9				/* pt_regs->r9 */
	pushq	%r10				/* pt_regs->r10 */
	pushq	%r11				/* pt_regs->r11 */
	pushq	%rbx				/* pt_regs->rbx */
	pushq	%rbp				/* pt_regs->rbp */
	pushq	%r12				/* pt_regs->r12 */
	pushq	%r13				/* pt_regs->r13 */
	pushq	%r14				/* pt_regs->r14 */
	pushq	%r15				/* pt_regs->r15 */

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	do_nmi

	/*
	 * Return back to user mode. We must *not* do the normal exit
	 * work, because we don't want to enable interrupts. Fortunately,
	 * do_nmi doesn't modify pt_regs.
	 */
	SWAPGS
	jmp	restore_c_regs_and_iret

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                              |
	 * | original Return RSP                                      |
	 * | original RFLAGS                                          |
	 * | original CS                                              |
	 * | original RIP                                             |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                     |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                 |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame         |
	 * | iret Return RSP  } on each loop iteration; overwritten   |
	 * | iret RFLAGS      } by a nested NMI to force another      |
	 * | iret CS          } iteration if needed.                  |
	 * | iret RIP         }                                       |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;        |
	 * | outermost Return RSP  } will not be changed before       |
	 * | outermost RFLAGS      } NMI processing is done.          |
	 * | outermost CS          } Copied to "iret" frame on each   |
	 * | outermost RIP         } iteration.                       |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                  |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware. Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 */

	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI. We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI. That's okay; the outer NMI handler is
	 * about to call do_nmi anyway, so we can just resume
	 * the outer NMI.
	 */

	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out
1:

	/*
	 * Now check "NMI executing". If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack. This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET. We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets. We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP. We set DF before we clear
	 * "NMI executing".
	 */
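
/*
 * Illustrative only: in the DF check a few lines below, the saved RFLAGS of
 * the hardware frame sits at 3*8(%rsp) (the rdx temp slot, RIP and CS occupy
 * the three slots below it), so (3*8 + 1)(%rsp) is its second byte.
 * X86_EFLAGS_DF is 0x0400 (bit 10), and 0x0400 >> 8 == 4 tests exactly that
 * bit within that byte.
 */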
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi

	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi

	/* Ah, it is within the NMI stack. */

	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz	first_nmi			/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	subq	$8, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	INTERRUPT_RETURN

first_nmi:
	/* Restore rdx. */
	movq	(%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq	$0

	/* Leave room for the "iret" frame */
	subq	$(5*8), %rsp

	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away. Disabled by
	 * default because IRET is very expensive.
	 */
	pushq	$0				/* SS */
	pushq	%rsp				/* RSP (minus 8 because of the previous push) */
	addq	$8, (%rsp)			/* Fix up RSP */
	pushfq					/* RFLAGS */
	pushq	$__KERNEL_CS			/* CS */
	pushq	$1f				/* RIP */
	INTERRUPT_RETURN			/* continues at repeat_nmi below */
1:
#endif

repeat_nmi:
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
	 *
	 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration. paranoid_entry will load the kernel
	 * gsbase if needed before we call do_nmi. "NMI executing"
	 * is zero.
	 */
	movq	$1, 10*8(%rsp)			/* Set "NMI executing". */

	/*
	 * Copy the "outermost" frame to the "iret" frame. NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
	 */
	addq	$(10*8), %rsp
	.rept 5
	pushq	-6*8(%rsp)
	.endr
	subq	$(5*8), %rsp
end_repeat_nmi:

	/*
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
	 */
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
	ALLOC_PT_GPREGS_ON_STACK

	/*
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled, an NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call	paranoid_entry

	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	do_nmi

	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS

	/* Point RSP at the "iret" frame. */
	REMOVE_PT_GPREGS_FROM_STACK 6*8

	/*
	 * Clear "NMI executing". Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths. On a native kernel, we
	 * could just inspect RIP, but, on paravirt kernels,
	 * INTERRUPT_RETURN can translate into a jump into a
	 * hypercall page.
	 */
	std
	movq	$0, 5*8(%rsp)			/* clear "NMI executing" */

	/*
	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
	 * stack in a single instruction. We are returning to kernel
	 * mode, so this cannot result in a fault.
	 */
	INTERRUPT_RETURN
END(nmi)

ENTRY(ignore_sysret)
	mov	$-ENOSYS, %eax
	sysret
END(ignore_sysret)