xref: /openbmc/linux/arch/x86/entry/entry_64.S (revision 167fd210ec0555d371a20435dac7c2c7052df7ed)
1b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */
2905a36a2SIngo Molnar/*
3905a36a2SIngo Molnar *  linux/arch/x86_64/entry.S
4905a36a2SIngo Molnar *
5905a36a2SIngo Molnar *  Copyright (C) 1991, 1992  Linus Torvalds
6905a36a2SIngo Molnar *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
7905a36a2SIngo Molnar *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
84d732138SIngo Molnar *
9905a36a2SIngo Molnar * entry.S contains the system-call and fault low-level handling routines.
10905a36a2SIngo Molnar *
11cb1aaebeSMauro Carvalho Chehab * Some of this is documented in Documentation/x86/entry_64.rst
12905a36a2SIngo Molnar *
13905a36a2SIngo Molnar * A note on terminology:
14905a36a2SIngo Molnar * - iret frame:	Architecture defined interrupt frame from SS to RIP
15905a36a2SIngo Molnar *			at the top of the kernel process stack.
16905a36a2SIngo Molnar *
17905a36a2SIngo Molnar * Some macro usage:
186dcc5627SJiri Slaby * - SYM_FUNC_START/END: Define functions in the symbol table.
194d732138SIngo Molnar * - idtentry:		Define exception entry points.
20905a36a2SIngo Molnar */
21905a36a2SIngo Molnar#include <linux/linkage.h>
22905a36a2SIngo Molnar#include <asm/segment.h>
23905a36a2SIngo Molnar#include <asm/cache.h>
24905a36a2SIngo Molnar#include <asm/errno.h>
25905a36a2SIngo Molnar#include <asm/asm-offsets.h>
26905a36a2SIngo Molnar#include <asm/msr.h>
27905a36a2SIngo Molnar#include <asm/unistd.h>
28905a36a2SIngo Molnar#include <asm/thread_info.h>
29905a36a2SIngo Molnar#include <asm/hw_irq.h>
30905a36a2SIngo Molnar#include <asm/page_types.h>
31905a36a2SIngo Molnar#include <asm/irqflags.h>
32905a36a2SIngo Molnar#include <asm/paravirt.h>
33905a36a2SIngo Molnar#include <asm/percpu.h>
34905a36a2SIngo Molnar#include <asm/asm.h>
35905a36a2SIngo Molnar#include <asm/smap.h>
36905a36a2SIngo Molnar#include <asm/pgtable_types.h>
37784d5699SAl Viro#include <asm/export.h>
388c1f7558SJosh Poimboeuf#include <asm/frame.h>
39cfa82a00SThomas Gleixner#include <asm/trapnr.h>
402641f08bSDavid Woodhouse#include <asm/nospec-branch.h>
41905a36a2SIngo Molnar#include <linux/err.h>
42905a36a2SIngo Molnar
436fd166aaSPeter Zijlstra#include "calling.h"
446fd166aaSPeter Zijlstra
45905a36a2SIngo Molnar.code64
46905a36a2SIngo Molnar.section .entry.text, "ax"
47905a36a2SIngo Molnar
48905a36a2SIngo Molnar#ifdef CONFIG_PARAVIRT
49bc7b11c0SJiri SlabySYM_CODE_START(native_usergs_sysret64)
508c1f7558SJosh Poimboeuf	UNWIND_HINT_EMPTY
51905a36a2SIngo Molnar	swapgs
52905a36a2SIngo Molnar	sysretq
53bc7b11c0SJiri SlabySYM_CODE_END(native_usergs_sysret64)
54905a36a2SIngo Molnar#endif /* CONFIG_PARAVIRT */
55905a36a2SIngo Molnar
56905a36a2SIngo Molnar/*
574d732138SIngo Molnar * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
58905a36a2SIngo Molnar *
59fda57b22SAndy Lutomirski * This is the only entry point used for 64-bit system calls.  The
60fda57b22SAndy Lutomirski * hardware interface is reasonably well designed and the register to
61fda57b22SAndy Lutomirski * argument mapping Linux uses fits well with the registers that are
62fda57b22SAndy Lutomirski * available when SYSCALL is used.
63fda57b22SAndy Lutomirski *
64fda57b22SAndy Lutomirski * SYSCALL instructions can be found inlined in libc implementations as
65fda57b22SAndy Lutomirski * well as some other programs and libraries.  There are also a handful
66fda57b22SAndy Lutomirski * of SYSCALL instructions in the vDSO used, for example, as a
67fda57b22SAndy Lutomirski * clock_gettimeofday fallback.
68fda57b22SAndy Lutomirski *
694d732138SIngo Molnar * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
70905a36a2SIngo Molnar * then loads new ss, cs, and rip from previously programmed MSRs.
71905a36a2SIngo Molnar * rflags gets masked by a value from another MSR (so CLD and CLAC
72905a36a2SIngo Molnar * are not needed). SYSCALL does not save anything on the stack
73905a36a2SIngo Molnar * and does not change rsp.
74905a36a2SIngo Molnar *
75905a36a2SIngo Molnar * Registers on entry:
76905a36a2SIngo Molnar * rax  system call number
77905a36a2SIngo Molnar * rcx  return address
78905a36a2SIngo Molnar * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
79905a36a2SIngo Molnar * rdi  arg0
80905a36a2SIngo Molnar * rsi  arg1
81905a36a2SIngo Molnar * rdx  arg2
82905a36a2SIngo Molnar * r10  arg3 (needs to be moved to rcx to conform to C ABI)
83905a36a2SIngo Molnar * r8   arg4
84905a36a2SIngo Molnar * r9   arg5
85905a36a2SIngo Molnar * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
86905a36a2SIngo Molnar *
87905a36a2SIngo Molnar * Only called from user space.
88905a36a2SIngo Molnar *
89905a36a2SIngo Molnar * When the user can change pt_regs->foo, always force IRET. That is because
90905a36a2SIngo Molnar * IRET deals with non-canonical addresses better. SYSRET has trouble
91905a36a2SIngo Molnar * with them due to bugs in both AMD and Intel CPUs.
92905a36a2SIngo Molnar */
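
/*
 * Illustrative user-space sketch of the convention described above (not
 * kernel code; __NR_write is assumed to be 1 on x86-64):
 *
 *	movl	$1, %eax		# system call number (__NR_write)
 *	movl	$1, %edi		# arg0: fd = stdout
 *	leaq	msg(%rip), %rsi		# arg1: buffer
 *	movl	$14, %edx		# arg2: count
 *	syscall				# rcx/r11 come back clobbered, rax = result
 *
 * A fourth argument would go in %r10 rather than %rcx, because SYSCALL
 * itself overwrites %rcx with the return address.
 */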
93905a36a2SIngo Molnar
94bc7b11c0SJiri SlabySYM_CODE_START(entry_SYSCALL_64)
958c1f7558SJosh Poimboeuf	UNWIND_HINT_EMPTY
96905a36a2SIngo Molnar
978a9949bcSAndy Lutomirski	swapgs
98bf904d27SAndy Lutomirski	/* tss.sp2 is scratch space. */
9998f05b51SAndy Lutomirski	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
100bf904d27SAndy Lutomirski	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
101905a36a2SIngo Molnar	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
102905a36a2SIngo Molnar
103905a36a2SIngo Molnar	/* Construct struct pt_regs on stack */
104905a36a2SIngo Molnar	pushq	$__USER_DS				/* pt_regs->ss */
10598f05b51SAndy Lutomirski	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
106905a36a2SIngo Molnar	pushq	%r11					/* pt_regs->flags */
107905a36a2SIngo Molnar	pushq	$__USER_CS				/* pt_regs->cs */
108905a36a2SIngo Molnar	pushq	%rcx					/* pt_regs->ip */
10926ba4e57SJiri SlabySYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
110905a36a2SIngo Molnar	pushq	%rax					/* pt_regs->orig_ax */
11130907fd1SDominik Brodowski
11230907fd1SDominik Brodowski	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
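
	/*
	 * At this point the full struct pt_regs has been built.  As a rough
	 * sketch (the exact PUSH_AND_CLEAR_REGS internals live in calling.h
	 * and are an assumption here), the frame from %rsp upward is:
	 *
	 *	r15 r14 r13 r12 bp bx r11 r10 r9 r8 ax(= -ENOSYS) cx dx si di
	 *	orig_ax(= user rax) ip(= user rcx) cs flags(= user r11) sp ss
	 */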
113905a36a2SIngo Molnar
1141e423bffSAndy Lutomirski	/* IRQs are off. */
115dfe64506SLinus Torvalds	movq	%rax, %rdi
116dfe64506SLinus Torvalds	movq	%rsp, %rsi
1171e423bffSAndy Lutomirski	call	do_syscall_64		/* returns with IRQs disabled */
1181e423bffSAndy Lutomirski
119905a36a2SIngo Molnar	/*
120905a36a2SIngo Molnar	 * Try to use SYSRET instead of IRET if we're returning to
1218a055d7fSAndy Lutomirski	 * a completely clean 64-bit userspace context.  If we're not,
1228a055d7fSAndy Lutomirski	 * go to the slow exit path.
123905a36a2SIngo Molnar	 */
124905a36a2SIngo Molnar	movq	RCX(%rsp), %rcx
125905a36a2SIngo Molnar	movq	RIP(%rsp), %r11
1268a055d7fSAndy Lutomirski
1278a055d7fSAndy Lutomirski	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
1288a055d7fSAndy Lutomirski	jne	swapgs_restore_regs_and_return_to_usermode
129905a36a2SIngo Molnar
130905a36a2SIngo Molnar	/*
131905a36a2SIngo Molnar	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
132905a36a2SIngo Molnar	 * in kernel space.  This essentially lets the user take over
133905a36a2SIngo Molnar	 * the kernel, since userspace controls RSP.
134905a36a2SIngo Molnar	 *
135905a36a2SIngo Molnar	 * If width of "canonical tail" ever becomes variable, this will need
136905a36a2SIngo Molnar	 * to be updated to remain correct on both old and new CPUs.
137361b4b58SKirill A. Shutemov	 *
138cbe0317bSKirill A. Shutemov	 * Change top bits to match most significant bit (47th or 56th bit
139cbe0317bSKirill A. Shutemov	 * depending on paging mode) in the address.
140905a36a2SIngo Molnar	 */
14109e61a77SKirill A. Shutemov#ifdef CONFIG_X86_5LEVEL
14239b95522SKirill A. Shutemov	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
14339b95522SKirill A. Shutemov		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
14409e61a77SKirill A. Shutemov#else
145905a36a2SIngo Molnar	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
146905a36a2SIngo Molnar	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
14709e61a77SKirill A. Shutemov#endif
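
	/*
	 * Worked example for the 4-level (48-bit) case: shl $16 / sar $16
	 * replicate bit 47 into bits 63:48.  A canonical RCX such as
	 * 0x00007fffffffe000 is left unchanged, while 0x0000800000000000
	 * becomes 0xffff800000000000 and therefore fails the comparison
	 * below, forcing the IRET path.
	 */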
1484d732138SIngo Molnar
149905a36a2SIngo Molnar	/* If this changed %rcx, it was not canonical */
150905a36a2SIngo Molnar	cmpq	%rcx, %r11
1518a055d7fSAndy Lutomirski	jne	swapgs_restore_regs_and_return_to_usermode
152905a36a2SIngo Molnar
153905a36a2SIngo Molnar	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
1548a055d7fSAndy Lutomirski	jne	swapgs_restore_regs_and_return_to_usermode
155905a36a2SIngo Molnar
156905a36a2SIngo Molnar	movq	R11(%rsp), %r11
157905a36a2SIngo Molnar	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
1588a055d7fSAndy Lutomirski	jne	swapgs_restore_regs_and_return_to_usermode
159905a36a2SIngo Molnar
160905a36a2SIngo Molnar	/*
1613e035305SBorislav Petkov	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
1623e035305SBorislav Petkov	 * restore RF properly. If the slowpath sets it for whatever reason, we
1633e035305SBorislav Petkov	 * need to restore it correctly.
1643e035305SBorislav Petkov	 *
1653e035305SBorislav Petkov	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
1663e035305SBorislav Petkov	 * trap from userspace immediately after SYSRET.  This would cause an
1673e035305SBorislav Petkov	 * infinite loop whenever #DB happens with register state that satisfies
1683e035305SBorislav Petkov	 * the opportunistic SYSRET conditions.  For example, single-stepping
1693e035305SBorislav Petkov	 * this user code:
170905a36a2SIngo Molnar	 *
171905a36a2SIngo Molnar	 *           movq	$stuck_here, %rcx
172905a36a2SIngo Molnar	 *           pushfq
173905a36a2SIngo Molnar	 *           popq %r11
174905a36a2SIngo Molnar	 *   stuck_here:
175905a36a2SIngo Molnar	 *
176905a36a2SIngo Molnar	 * would never get past 'stuck_here'.
177905a36a2SIngo Molnar	 */
178905a36a2SIngo Molnar	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
1798a055d7fSAndy Lutomirski	jnz	swapgs_restore_regs_and_return_to_usermode
180905a36a2SIngo Molnar
181905a36a2SIngo Molnar	/* nothing to check for RSP */
182905a36a2SIngo Molnar
183905a36a2SIngo Molnar	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
1848a055d7fSAndy Lutomirski	jne	swapgs_restore_regs_and_return_to_usermode
185905a36a2SIngo Molnar
186905a36a2SIngo Molnar	/*
187905a36a2SIngo Molnar	 * We win! This label is here just for ease of understanding
188905a36a2SIngo Molnar	 * perf profiles. Nothing jumps here.
189905a36a2SIngo Molnar	 */
190905a36a2SIngo Molnarsyscall_return_via_sysret:
191905a36a2SIngo Molnar	/* rcx and r11 are already restored (see code above) */
192502af0d7SDominik Brodowski	POP_REGS pop_rdi=0 skip_r11rcx=1
1933e3b9293SAndy Lutomirski
1943e3b9293SAndy Lutomirski	/*
1953e3b9293SAndy Lutomirski	 * Now all regs are restored except RSP and RDI.
1963e3b9293SAndy Lutomirski	 * Save old stack pointer and switch to trampoline stack.
1973e3b9293SAndy Lutomirski	 */
1983e3b9293SAndy Lutomirski	movq	%rsp, %rdi
199c482feefSAndy Lutomirski	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
2001fb14363SJosh Poimboeuf	UNWIND_HINT_EMPTY
2013e3b9293SAndy Lutomirski
2023e3b9293SAndy Lutomirski	pushq	RSP-RDI(%rdi)	/* RSP */
2033e3b9293SAndy Lutomirski	pushq	(%rdi)		/* RDI */
2043e3b9293SAndy Lutomirski
2053e3b9293SAndy Lutomirski	/*
2063e3b9293SAndy Lutomirski	 * We are on the trampoline stack.  All regs except RDI are live.
2073e3b9293SAndy Lutomirski	 * We can do future final exit work right here.
2083e3b9293SAndy Lutomirski	 */
209afaef01cSAlexander Popov	STACKLEAK_ERASE_NOCLOBBER
210afaef01cSAlexander Popov
2116fd166aaSPeter Zijlstra	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
2123e3b9293SAndy Lutomirski
2134fbb3910SAndy Lutomirski	popq	%rdi
2143e3b9293SAndy Lutomirski	popq	%rsp
215905a36a2SIngo Molnar	USERGS_SYSRET64
216bc7b11c0SJiri SlabySYM_CODE_END(entry_SYSCALL_64)
217905a36a2SIngo Molnar
218905a36a2SIngo Molnar/*
2190100301bSBrian Gerst * %rdi: prev task
2200100301bSBrian Gerst * %rsi: next task
2210100301bSBrian Gerst */
222b9f6976bSThomas Gleixner.pushsection .text, "ax"
22396c64806SJosh PoimboeufSYM_FUNC_START(__switch_to_asm)
2240100301bSBrian Gerst	/*
2250100301bSBrian Gerst	 * Save callee-saved registers
2260100301bSBrian Gerst	 * This must match the order in inactive_task_frame
2270100301bSBrian Gerst	 */
2280100301bSBrian Gerst	pushq	%rbp
2290100301bSBrian Gerst	pushq	%rbx
2300100301bSBrian Gerst	pushq	%r12
2310100301bSBrian Gerst	pushq	%r13
2320100301bSBrian Gerst	pushq	%r14
2330100301bSBrian Gerst	pushq	%r15
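
	/*
	 * After the six pushes above, the words at %rsp are, from low to
	 * high address: r15, r14, r13, r12, rbx, rbp, return address.
	 * This is assumed to mirror struct inactive_task_frame in
	 * <asm/switch_to.h>, which is what the comment above refers to.
	 */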
2340100301bSBrian Gerst
2350100301bSBrian Gerst	/* switch stack */
2360100301bSBrian Gerst	movq	%rsp, TASK_threadsp(%rdi)
2370100301bSBrian Gerst	movq	TASK_threadsp(%rsi), %rsp
2380100301bSBrian Gerst
239050e9baaSLinus Torvalds#ifdef CONFIG_STACKPROTECTOR
2400100301bSBrian Gerst	movq	TASK_stack_canary(%rsi), %rbx
241e6401c13SAndy Lutomirski	movq	%rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
2420100301bSBrian Gerst#endif
2430100301bSBrian Gerst
244c995efd5SDavid Woodhouse#ifdef CONFIG_RETPOLINE
245c995efd5SDavid Woodhouse	/*
246c995efd5SDavid Woodhouse	 * When switching from a shallower to a deeper call stack
247c995efd5SDavid Woodhouse	 * the RSB may either underflow or use entries populated
248c995efd5SDavid Woodhouse	 * with userspace addresses. On CPUs where those concerns
249c995efd5SDavid Woodhouse	 * exist, overwrite the RSB with entries which capture
250c995efd5SDavid Woodhouse	 * speculative execution to prevent attack.
251c995efd5SDavid Woodhouse	 */
252d1c99108SDavid Woodhouse	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
253c995efd5SDavid Woodhouse#endif
254c995efd5SDavid Woodhouse
2550100301bSBrian Gerst	/* restore callee-saved registers */
2560100301bSBrian Gerst	popq	%r15
2570100301bSBrian Gerst	popq	%r14
2580100301bSBrian Gerst	popq	%r13
2590100301bSBrian Gerst	popq	%r12
2600100301bSBrian Gerst	popq	%rbx
2610100301bSBrian Gerst	popq	%rbp
2620100301bSBrian Gerst
2630100301bSBrian Gerst	jmp	__switch_to
26496c64806SJosh PoimboeufSYM_FUNC_END(__switch_to_asm)
265b9f6976bSThomas Gleixner.popsection
2660100301bSBrian Gerst
2670100301bSBrian Gerst/*
268905a36a2SIngo Molnar * A newly forked process directly context switches into this address.
269905a36a2SIngo Molnar *
2700100301bSBrian Gerst * rax: prev task we switched from
271616d2483SBrian Gerst * rbx: kernel thread func (NULL for user thread)
272616d2483SBrian Gerst * r12: kernel thread arg
273905a36a2SIngo Molnar */
274b9f6976bSThomas Gleixner.pushsection .text, "ax"
275bc7b11c0SJiri SlabySYM_CODE_START(ret_from_fork)
2768c1f7558SJosh Poimboeuf	UNWIND_HINT_EMPTY
2770100301bSBrian Gerst	movq	%rax, %rdi
2784d732138SIngo Molnar	call	schedule_tail			/* rdi: 'prev' task parameter */
279905a36a2SIngo Molnar
280616d2483SBrian Gerst	testq	%rbx, %rbx			/* from kernel_thread? */
281616d2483SBrian Gerst	jnz	1f				/* kernel threads are uncommon */
282905a36a2SIngo Molnar
283616d2483SBrian Gerst2:
2848c1f7558SJosh Poimboeuf	UNWIND_HINT_REGS
285ebd57499SJosh Poimboeuf	movq	%rsp, %rdi
286*167fd210SThomas Gleixner	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
2878a055d7fSAndy Lutomirski	jmp	swapgs_restore_regs_and_return_to_usermode
288616d2483SBrian Gerst
289616d2483SBrian Gerst1:
290616d2483SBrian Gerst	/* kernel thread */
291d31a5802SJosh Poimboeuf	UNWIND_HINT_EMPTY
292616d2483SBrian Gerst	movq	%r12, %rdi
29334fdce69SPeter Zijlstra	CALL_NOSPEC rbx
294616d2483SBrian Gerst	/*
295616d2483SBrian Gerst	 * A kernel thread is allowed to return here after successfully
296616d2483SBrian Gerst	 * calling do_execve().  Exit to userspace to complete the execve()
297616d2483SBrian Gerst	 * syscall.
298616d2483SBrian Gerst	 */
299616d2483SBrian Gerst	movq	$0, RAX(%rsp)
300616d2483SBrian Gerst	jmp	2b
301bc7b11c0SJiri SlabySYM_CODE_END(ret_from_fork)
302b9f6976bSThomas Gleixner.popsection
303905a36a2SIngo Molnar
3041d3e53e8SAndy Lutomirski.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
3051d3e53e8SAndy Lutomirski#ifdef CONFIG_DEBUG_ENTRY
306e17f8234SBoris Ostrovsky	pushq %rax
307e17f8234SBoris Ostrovsky	SAVE_FLAGS(CLBR_RAX)
308e17f8234SBoris Ostrovsky	testl $X86_EFLAGS_IF, %eax
3091d3e53e8SAndy Lutomirski	jz .Lokay_\@
3101d3e53e8SAndy Lutomirski	ud2
3111d3e53e8SAndy Lutomirski.Lokay_\@:
312e17f8234SBoris Ostrovsky	popq %rax
3131d3e53e8SAndy Lutomirski#endif
3141d3e53e8SAndy Lutomirski.endm
3151d3e53e8SAndy Lutomirski
316cfa82a00SThomas Gleixner/**
317cfa82a00SThomas Gleixner * idtentry_body - Macro to emit code calling the C function
318cfa82a00SThomas Gleixner * @cfunc:		C function to be called
319cfa82a00SThomas Gleixner * @has_error_code:	Hardware pushed error code on stack
320cfa82a00SThomas Gleixner */
321e2dcb5f1SThomas Gleixner.macro idtentry_body cfunc has_error_code:req
322cfa82a00SThomas Gleixner
323cfa82a00SThomas Gleixner	call	error_entry
324cfa82a00SThomas Gleixner	UNWIND_HINT_REGS
325cfa82a00SThomas Gleixner
326cfa82a00SThomas Gleixner	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/
327cfa82a00SThomas Gleixner
328cfa82a00SThomas Gleixner	.if \has_error_code == 1
329cfa82a00SThomas Gleixner		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
330cfa82a00SThomas Gleixner		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
331cfa82a00SThomas Gleixner	.endif
332cfa82a00SThomas Gleixner
333cfa82a00SThomas Gleixner	call	\cfunc
334cfa82a00SThomas Gleixner
335424c7d0aSThomas Gleixner	jmp	error_return
336cfa82a00SThomas Gleixner.endm
337cfa82a00SThomas Gleixner
338cfa82a00SThomas Gleixner/**
339cfa82a00SThomas Gleixner * idtentry - Macro to generate entry stubs for simple IDT entries
340cfa82a00SThomas Gleixner * @vector:		Vector number
341cfa82a00SThomas Gleixner * @asmsym:		ASM symbol for the entry point
342cfa82a00SThomas Gleixner * @cfunc:		C function to be called
343cfa82a00SThomas Gleixner * @has_error_code:	Hardware pushed error code on stack
344cfa82a00SThomas Gleixner *
345cfa82a00SThomas Gleixner * The macro emits code to set up the kernel context for straightforward
346cfa82a00SThomas Gleixner * and simple IDT entries. No IST stack, no paranoid entry checks.
347cfa82a00SThomas Gleixner */
348e2dcb5f1SThomas Gleixner.macro idtentry vector asmsym cfunc has_error_code:req
349cfa82a00SThomas GleixnerSYM_CODE_START(\asmsym)
350cfa82a00SThomas Gleixner	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
351cfa82a00SThomas Gleixner	ASM_CLAC
352cfa82a00SThomas Gleixner
353cfa82a00SThomas Gleixner	.if \has_error_code == 0
354cfa82a00SThomas Gleixner		pushq	$-1			/* ORIG_RAX: no syscall to restart */
355cfa82a00SThomas Gleixner	.endif
356cfa82a00SThomas Gleixner
357cfa82a00SThomas Gleixner	.if \vector == X86_TRAP_BP
358cfa82a00SThomas Gleixner		/*
359cfa82a00SThomas Gleixner		 * If coming from kernel space, create a 6-word gap to allow the
360cfa82a00SThomas Gleixner		 * int3 handler to emulate a call instruction.
361cfa82a00SThomas Gleixner		 */
362cfa82a00SThomas Gleixner		testb	$3, CS-ORIG_RAX(%rsp)
363cfa82a00SThomas Gleixner		jnz	.Lfrom_usermode_no_gap_\@
364cfa82a00SThomas Gleixner		.rept	6
365cfa82a00SThomas Gleixner		pushq	5*8(%rsp)
366cfa82a00SThomas Gleixner		.endr
367cfa82a00SThomas Gleixner		UNWIND_HINT_IRET_REGS offset=8
368cfa82a00SThomas Gleixner.Lfrom_usermode_no_gap_\@:
369cfa82a00SThomas Gleixner	.endif
370cfa82a00SThomas Gleixner
371e2dcb5f1SThomas Gleixner	idtentry_body \cfunc \has_error_code
372cfa82a00SThomas Gleixner
373cfa82a00SThomas Gleixner_ASM_NOKPROBE(\asmsym)
374cfa82a00SThomas GleixnerSYM_CODE_END(\asmsym)
375cfa82a00SThomas Gleixner.endm
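
/*
 * Illustrative instantiations (the real ones are emitted from
 * asm/idtentry.h; the exact expansion is an assumption here):
 *
 *	idtentry X86_TRAP_DE	asm_exc_divide_error		exc_divide_error	has_error_code=0
 *	idtentry X86_TRAP_GP	asm_exc_general_protection	exc_general_protection	has_error_code=1
 */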
376cfa82a00SThomas Gleixner
377cfa82a00SThomas Gleixner/*
3780bf7c314SThomas Gleixner * Interrupt entry/exit.
3790bf7c314SThomas Gleixner *
3800bf7c314SThomas Gleixner * The interrupt stubs push (vector) onto the stack, which is the error_code
3810bf7c314SThomas Gleixner * position of idtentry exceptions, and jump to one of the two idtentry points
3820bf7c314SThomas Gleixner * (common/spurious).
3830bf7c314SThomas Gleixner *
3840bf7c314SThomas Gleixner * common_interrupt is a hotpath, align it to a cache line
3850bf7c314SThomas Gleixner */
3860bf7c314SThomas Gleixner.macro idtentry_irq vector cfunc
3870bf7c314SThomas Gleixner	.p2align CONFIG_X86_L1_CACHE_SHIFT
3880bf7c314SThomas Gleixner	idtentry \vector asm_\cfunc \cfunc has_error_code=1
3890bf7c314SThomas Gleixner.endm
3900bf7c314SThomas Gleixner
3910bf7c314SThomas Gleixner/*
3926368558cSThomas Gleixner * System vectors which invoke their handlers directly and do not go
3936368558cSThomas Gleixner * through the regular common device interrupt handling code.
3946368558cSThomas Gleixner */
3956368558cSThomas Gleixner.macro idtentry_sysvec vector cfunc
3966368558cSThomas Gleixner	idtentry \vector asm_\cfunc \cfunc has_error_code=0
3976368558cSThomas Gleixner.endm
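
/*
 * As with idtentry above, idtentry_irq and idtentry_sysvec are instantiated
 * from asm/idtentry.h.  An illustrative (assumed) example:
 *
 *	idtentry_sysvec	LOCAL_TIMER_VECTOR	sysvec_apic_timer_interrupt
 */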
3986368558cSThomas Gleixner
399cfa82a00SThomas Gleixner/**
400cfa82a00SThomas Gleixner * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
401cfa82a00SThomas Gleixner * @vector:		Vector number
402cfa82a00SThomas Gleixner * @asmsym:		ASM symbol for the entry point
403cfa82a00SThomas Gleixner * @cfunc:		C function to be called
404cfa82a00SThomas Gleixner *
405cfa82a00SThomas Gleixner * The macro emits code to set up the kernel context for #MC and #DB
406cfa82a00SThomas Gleixner *
407cfa82a00SThomas Gleixner * If the entry comes from user space it uses the normal entry path
408cfa82a00SThomas Gleixner * including the return to user space work and preemption checks on
409cfa82a00SThomas Gleixner * exit.
410cfa82a00SThomas Gleixner *
411cfa82a00SThomas Gleixner * If it hits in kernel mode then it needs to go through the paranoid
412cfa82a00SThomas Gleixner * entry, as the exception can hit any random state. No preemption
413cfa82a00SThomas Gleixner * check on exit to keep the paranoid path simple.
414cfa82a00SThomas Gleixner */
415cfa82a00SThomas Gleixner.macro idtentry_mce_db vector asmsym cfunc
416cfa82a00SThomas GleixnerSYM_CODE_START(\asmsym)
417cfa82a00SThomas Gleixner	UNWIND_HINT_IRET_REGS
418cfa82a00SThomas Gleixner	ASM_CLAC
419cfa82a00SThomas Gleixner
420cfa82a00SThomas Gleixner	pushq	$-1			/* ORIG_RAX: no syscall to restart */
421cfa82a00SThomas Gleixner
422cfa82a00SThomas Gleixner	/*
423cfa82a00SThomas Gleixner	 * If the entry is from userspace, switch stacks and treat it as
424cfa82a00SThomas Gleixner	 * a normal entry.
425cfa82a00SThomas Gleixner	 */
426cfa82a00SThomas Gleixner	testb	$3, CS-ORIG_RAX(%rsp)
427cfa82a00SThomas Gleixner	jnz	.Lfrom_usermode_switch_stack_\@
428cfa82a00SThomas Gleixner
429cfa82a00SThomas Gleixner	/*
430cfa82a00SThomas Gleixner	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
431cfa82a00SThomas Gleixner	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
432cfa82a00SThomas Gleixner	 */
433cfa82a00SThomas Gleixner	call	paranoid_entry
434cfa82a00SThomas Gleixner
435cfa82a00SThomas Gleixner	UNWIND_HINT_REGS
436cfa82a00SThomas Gleixner
437cfa82a00SThomas Gleixner	movq	%rsp, %rdi		/* pt_regs pointer */
438cfa82a00SThomas Gleixner
439cfa82a00SThomas Gleixner	call	\cfunc
440cfa82a00SThomas Gleixner
441cfa82a00SThomas Gleixner	jmp	paranoid_exit
442cfa82a00SThomas Gleixner
443cfa82a00SThomas Gleixner	/* Switch to the regular task stack and use the noist entry point */
444cfa82a00SThomas Gleixner.Lfrom_usermode_switch_stack_\@:
445e2dcb5f1SThomas Gleixner	idtentry_body noist_\cfunc, has_error_code=0
446cfa82a00SThomas Gleixner
447cfa82a00SThomas Gleixner_ASM_NOKPROBE(\asmsym)
448cfa82a00SThomas GleixnerSYM_CODE_END(\asmsym)
449cfa82a00SThomas Gleixner.endm
450cfa82a00SThomas Gleixner
451cfa82a00SThomas Gleixner/*
452cfa82a00SThomas Gleixner * Double fault entry. Straight paranoid. No checks on which context this
453cfa82a00SThomas Gleixner * comes from, because for the espfix-induced #DF such a check would do the
454cfa82a00SThomas Gleixner * wrong thing.
455cfa82a00SThomas Gleixner */
456cfa82a00SThomas Gleixner.macro idtentry_df vector asmsym cfunc
457cfa82a00SThomas GleixnerSYM_CODE_START(\asmsym)
458cfa82a00SThomas Gleixner	UNWIND_HINT_IRET_REGS offset=8
459cfa82a00SThomas Gleixner	ASM_CLAC
460cfa82a00SThomas Gleixner
461cfa82a00SThomas Gleixner	/*
462cfa82a00SThomas Gleixner	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
463cfa82a00SThomas Gleixner	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
464cfa82a00SThomas Gleixner	 */
465cfa82a00SThomas Gleixner	call	paranoid_entry
466cfa82a00SThomas Gleixner	UNWIND_HINT_REGS
467cfa82a00SThomas Gleixner
468cfa82a00SThomas Gleixner	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
469cfa82a00SThomas Gleixner	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
470cfa82a00SThomas Gleixner	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
471cfa82a00SThomas Gleixner	call	\cfunc
472cfa82a00SThomas Gleixner
473cfa82a00SThomas Gleixner	jmp	paranoid_exit
474cfa82a00SThomas Gleixner
475cfa82a00SThomas Gleixner_ASM_NOKPROBE(\asmsym)
476cfa82a00SThomas GleixnerSYM_CODE_END(\asmsym)
477cfa82a00SThomas Gleixner.endm
478cfa82a00SThomas Gleixner
479905a36a2SIngo Molnar/*
48053aaf262SThomas Gleixner * Include the defines which emit the idt entries which are
481f0178fc0SThomas Gleixner * shared between 32 and 64 bit and emit the __irqentry_text_* markers
482f0178fc0SThomas Gleixner * so the stacktrace boundary checks work.
48353aaf262SThomas Gleixner */
484f0178fc0SThomas Gleixner	.align 16
485f0178fc0SThomas Gleixner	.globl __irqentry_text_start
486f0178fc0SThomas Gleixner__irqentry_text_start:
487f0178fc0SThomas Gleixner
48853aaf262SThomas Gleixner#include <asm/idtentry.h>
48953aaf262SThomas Gleixner
490f0178fc0SThomas Gleixner	.align 16
491f0178fc0SThomas Gleixner	.globl __irqentry_text_end
492f0178fc0SThomas Gleixner__irqentry_text_end:
493f0178fc0SThomas Gleixner
494fa5e5c40SThomas GleixnerSYM_CODE_START_LOCAL(common_interrupt_return)
49526ba4e57SJiri SlabySYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
49626c4ef9cSAndy Lutomirski#ifdef CONFIG_DEBUG_ENTRY
49726c4ef9cSAndy Lutomirski	/* Assert that pt_regs indicates user mode. */
4981e4c4f61SBorislav Petkov	testb	$3, CS(%rsp)
49926c4ef9cSAndy Lutomirski	jnz	1f
50026c4ef9cSAndy Lutomirski	ud2
50126c4ef9cSAndy Lutomirski1:
50226c4ef9cSAndy Lutomirski#endif
503502af0d7SDominik Brodowski	POP_REGS pop_rdi=0
5043e3b9293SAndy Lutomirski
5053e3b9293SAndy Lutomirski	/*
5063e3b9293SAndy Lutomirski	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
5073e3b9293SAndy Lutomirski	 * Save old stack pointer and switch to trampoline stack.
5083e3b9293SAndy Lutomirski	 */
5093e3b9293SAndy Lutomirski	movq	%rsp, %rdi
510c482feefSAndy Lutomirski	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
5111fb14363SJosh Poimboeuf	UNWIND_HINT_EMPTY
5123e3b9293SAndy Lutomirski
5133e3b9293SAndy Lutomirski	/* Copy the IRET frame to the trampoline stack. */
5143e3b9293SAndy Lutomirski	pushq	6*8(%rdi)	/* SS */
5153e3b9293SAndy Lutomirski	pushq	5*8(%rdi)	/* RSP */
5163e3b9293SAndy Lutomirski	pushq	4*8(%rdi)	/* EFLAGS */
5173e3b9293SAndy Lutomirski	pushq	3*8(%rdi)	/* CS */
5183e3b9293SAndy Lutomirski	pushq	2*8(%rdi)	/* RIP */
5193e3b9293SAndy Lutomirski
5203e3b9293SAndy Lutomirski	/* Push user RDI on the trampoline stack. */
5213e3b9293SAndy Lutomirski	pushq	(%rdi)
5223e3b9293SAndy Lutomirski
5233e3b9293SAndy Lutomirski	/*
5243e3b9293SAndy Lutomirski	 * We are on the trampoline stack.  All regs except RDI are live.
5253e3b9293SAndy Lutomirski	 * We can do future final exit work right here.
5263e3b9293SAndy Lutomirski	 */
527afaef01cSAlexander Popov	STACKLEAK_ERASE_NOCLOBBER
5283e3b9293SAndy Lutomirski
5296fd166aaSPeter Zijlstra	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
5308a09317bSDave Hansen
5313e3b9293SAndy Lutomirski	/* Restore RDI. */
5323e3b9293SAndy Lutomirski	popq	%rdi
5333e3b9293SAndy Lutomirski	SWAPGS
53426c4ef9cSAndy Lutomirski	INTERRUPT_RETURN
53526c4ef9cSAndy Lutomirski
536905a36a2SIngo Molnar
53726ba4e57SJiri SlabySYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
53826c4ef9cSAndy Lutomirski#ifdef CONFIG_DEBUG_ENTRY
53926c4ef9cSAndy Lutomirski	/* Assert that pt_regs indicates kernel mode. */
5401e4c4f61SBorislav Petkov	testb	$3, CS(%rsp)
54126c4ef9cSAndy Lutomirski	jz	1f
54226c4ef9cSAndy Lutomirski	ud2
54326c4ef9cSAndy Lutomirski1:
54426c4ef9cSAndy Lutomirski#endif
545502af0d7SDominik Brodowski	POP_REGS
546e872045bSAndy Lutomirski	addq	$8, %rsp	/* skip regs->orig_ax */
54710bcc80eSMathieu Desnoyers	/*
54810bcc80eSMathieu Desnoyers	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
54910bcc80eSMathieu Desnoyers	 * when returning from an IPI handler.
55010bcc80eSMathieu Desnoyers	 */
551905a36a2SIngo Molnar	INTERRUPT_RETURN
552905a36a2SIngo Molnar
553cc66936eSJiri SlabySYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
5548c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS
555905a36a2SIngo Molnar	/*
556905a36a2SIngo Molnar	 * Are we returning to a stack segment from the LDT?  Note: in
557905a36a2SIngo Molnar	 * 64-bit mode SS:RSP on the exception stack is always valid.
558905a36a2SIngo Molnar	 */
559905a36a2SIngo Molnar#ifdef CONFIG_X86_ESPFIX64
560905a36a2SIngo Molnar	testb	$4, (SS-RIP)(%rsp)
561905a36a2SIngo Molnar	jnz	native_irq_return_ldt
562905a36a2SIngo Molnar#endif
563905a36a2SIngo Molnar
564cc66936eSJiri SlabySYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
565905a36a2SIngo Molnar	/*
566905a36a2SIngo Molnar	 * This may fault.  Non-paranoid faults on return to userspace are
567905a36a2SIngo Molnar	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
568c29c775aSThomas Gleixner	 * Double-faults due to espfix64 are handled in exc_double_fault.
569905a36a2SIngo Molnar	 * Other faults here are fatal.
570905a36a2SIngo Molnar	 */
571905a36a2SIngo Molnar	iretq
572905a36a2SIngo Molnar
573905a36a2SIngo Molnar#ifdef CONFIG_X86_ESPFIX64
574905a36a2SIngo Molnarnative_irq_return_ldt:
57585063facSAndy Lutomirski	/*
57685063facSAndy Lutomirski	 * We are running with user GSBASE.  All GPRs contain their user
57785063facSAndy Lutomirski	 * values.  We have a percpu ESPFIX stack that is eight slots
57885063facSAndy Lutomirski	 * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
57985063facSAndy Lutomirski	 * of the ESPFIX stack.
58085063facSAndy Lutomirski	 *
58185063facSAndy Lutomirski	 * We clobber RAX and RDI in this code.  We stash RDI on the
58285063facSAndy Lutomirski	 * normal stack and RAX on the ESPFIX stack.
58385063facSAndy Lutomirski	 *
58485063facSAndy Lutomirski	 * The ESPFIX stack layout we set up looks like this:
58585063facSAndy Lutomirski	 *
58685063facSAndy Lutomirski	 * --- top of ESPFIX stack ---
58785063facSAndy Lutomirski	 * SS
58885063facSAndy Lutomirski	 * RSP
58985063facSAndy Lutomirski	 * RFLAGS
59085063facSAndy Lutomirski	 * CS
59185063facSAndy Lutomirski	 * RIP  <-- RSP points here when we're done
59285063facSAndy Lutomirski	 * RAX  <-- espfix_waddr points here
59385063facSAndy Lutomirski	 * --- bottom of ESPFIX stack ---
59485063facSAndy Lutomirski	 */
59585063facSAndy Lutomirski
59685063facSAndy Lutomirski	pushq	%rdi				/* Stash user RDI */
5978a09317bSDave Hansen	SWAPGS					/* to kernel GS */
5988a09317bSDave Hansen	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */
5998a09317bSDave Hansen
600905a36a2SIngo Molnar	movq	PER_CPU_VAR(espfix_waddr), %rdi
60185063facSAndy Lutomirski	movq	%rax, (0*8)(%rdi)		/* user RAX */
60285063facSAndy Lutomirski	movq	(1*8)(%rsp), %rax		/* user RIP */
603905a36a2SIngo Molnar	movq	%rax, (1*8)(%rdi)
60485063facSAndy Lutomirski	movq	(2*8)(%rsp), %rax		/* user CS */
605905a36a2SIngo Molnar	movq	%rax, (2*8)(%rdi)
60685063facSAndy Lutomirski	movq	(3*8)(%rsp), %rax		/* user RFLAGS */
607905a36a2SIngo Molnar	movq	%rax, (3*8)(%rdi)
60885063facSAndy Lutomirski	movq	(5*8)(%rsp), %rax		/* user SS */
609905a36a2SIngo Molnar	movq	%rax, (5*8)(%rdi)
61085063facSAndy Lutomirski	movq	(4*8)(%rsp), %rax		/* user RSP */
611905a36a2SIngo Molnar	movq	%rax, (4*8)(%rdi)
61285063facSAndy Lutomirski	/* Now RAX == RSP. */
61385063facSAndy Lutomirski
61485063facSAndy Lutomirski	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
61585063facSAndy Lutomirski
61685063facSAndy Lutomirski	/*
61785063facSAndy Lutomirski	 * espfix_stack[31:16] == 0.  The page tables are set up such that
61885063facSAndy Lutomirski	 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
61985063facSAndy Lutomirski	 * espfix_waddr for any X.  That is, there are 65536 RO aliases of
62085063facSAndy Lutomirski	 * the same page.  Set up RSP so that RSP[31:16] contains the
62185063facSAndy Lutomirski	 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
62285063facSAndy Lutomirski	 * still points to an RO alias of the ESPFIX stack.
62385063facSAndy Lutomirski	 */
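	/*
	 * Worked example with illustrative values: if the user RSP was
	 * 0x00007ffd82345678, %rax now holds 0x82340000, and the OR below
	 * produces an RSP whose bits 31:16 are 0x8234 while every other bit
	 * comes from espfix_stack, i.e. an address inside the RO alias.
	 */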
624905a36a2SIngo Molnar	orq	PER_CPU_VAR(espfix_stack), %rax
6258a09317bSDave Hansen
6266fd166aaSPeter Zijlstra	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
6278a09317bSDave Hansen	SWAPGS					/* to user GS */
6288a09317bSDave Hansen	popq	%rdi				/* Restore user RDI */
6298a09317bSDave Hansen
630905a36a2SIngo Molnar	movq	%rax, %rsp
6318c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS offset=8
63285063facSAndy Lutomirski
63385063facSAndy Lutomirski	/*
63485063facSAndy Lutomirski	 * At this point, we cannot write to the stack any more, but we can
63585063facSAndy Lutomirski	 * still read.
63685063facSAndy Lutomirski	 */
63785063facSAndy Lutomirski	popq	%rax				/* Restore user RAX */
63885063facSAndy Lutomirski
63985063facSAndy Lutomirski	/*
64085063facSAndy Lutomirski	 * RSP now points to an ordinary IRET frame, except that the page
64185063facSAndy Lutomirski	 * is read-only and RSP[31:16] are preloaded with the userspace
64285063facSAndy Lutomirski	 * values.  We can now IRET back to userspace.
64385063facSAndy Lutomirski	 */
644905a36a2SIngo Molnar	jmp	native_irq_return_iret
645905a36a2SIngo Molnar#endif
646fa5e5c40SThomas GleixnerSYM_CODE_END(common_interrupt_return)
647fa5e5c40SThomas Gleixner_ASM_NOKPROBE(common_interrupt_return)
648905a36a2SIngo Molnar
649905a36a2SIngo Molnar/*
6504d732138SIngo Molnar * Reload gs selector with exception handling
6514d732138SIngo Molnar * edi:  new selector
652b9f6976bSThomas Gleixner *
653b9f6976bSThomas Gleixner * Is in entry.text as it shouldn't be instrumented.
6544d732138SIngo Molnar */
655410367e3SThomas GleixnerSYM_FUNC_START(asm_load_gs_index)
6568c1f7558SJosh Poimboeuf	FRAME_BEGIN
657c9317202SThomas Gleixner	swapgs
65842c748bbSBorislav Petkov.Lgs_change:
659905a36a2SIngo Molnar	movl	%edi, %gs
66096e5d28aSBorislav Petkov2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
661c9317202SThomas Gleixner	swapgs
6628c1f7558SJosh Poimboeuf	FRAME_END
663905a36a2SIngo Molnar	ret
664410367e3SThomas GleixnerSYM_FUNC_END(asm_load_gs_index)
665410367e3SThomas GleixnerEXPORT_SYMBOL(asm_load_gs_index)
666905a36a2SIngo Molnar
66798ededb6SJiri Slaby	_ASM_EXTABLE(.Lgs_change, .Lbad_gs)
668905a36a2SIngo Molnar	.section .fixup, "ax"
669905a36a2SIngo Molnar	/* running with kernelgs */
670ef77e688SJiri SlabySYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
671c9317202SThomas Gleixner	swapgs					/* switch back to user gs */
672b038c842SAndy Lutomirski.macro ZAP_GS
673b038c842SAndy Lutomirski	/* This can't be a string because the preprocessor needs to see it. */
674b038c842SAndy Lutomirski	movl $__USER_DS, %eax
675b038c842SAndy Lutomirski	movl %eax, %gs
676b038c842SAndy Lutomirski.endm
677b038c842SAndy Lutomirski	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
678905a36a2SIngo Molnar	xorl	%eax, %eax
679905a36a2SIngo Molnar	movl	%eax, %gs
680905a36a2SIngo Molnar	jmp	2b
681ef77e688SJiri SlabySYM_CODE_END(.Lbad_gs)
682905a36a2SIngo Molnar	.previous
683905a36a2SIngo Molnar
684931b9414SThomas Gleixner/*
685931b9414SThomas Gleixner * rdi: New stack pointer, pointing to the top word of the stack
686931b9414SThomas Gleixner * rsi: Function pointer
687931b9414SThomas Gleixner * rdx: Function argument (can be NULL if none)
688931b9414SThomas Gleixner */
689931b9414SThomas GleixnerSYM_FUNC_START(asm_call_on_stack)
690931b9414SThomas Gleixner	/*
691931b9414SThomas Gleixner	 * Save the frame pointer unconditionally. This allows the ORC
692931b9414SThomas Gleixner	 * unwinder to handle the stack switch.
693931b9414SThomas Gleixner	 */
694931b9414SThomas Gleixner	pushq		%rbp
695931b9414SThomas Gleixner	mov		%rsp, %rbp
696931b9414SThomas Gleixner
697931b9414SThomas Gleixner	/*
698931b9414SThomas Gleixner	 * The unwinder relies on the word at the top of the new stack
699931b9414SThomas Gleixner	 * page linking back to the previous RSP.
700931b9414SThomas Gleixner	 */
701931b9414SThomas Gleixner	mov		%rsp, (%rdi)
702931b9414SThomas Gleixner	mov		%rdi, %rsp
703931b9414SThomas Gleixner	/* Move the argument to the right place */
704931b9414SThomas Gleixner	mov		%rdx, %rdi
705931b9414SThomas Gleixner
706931b9414SThomas Gleixner1:
707931b9414SThomas Gleixner	.pushsection .discard.instr_begin
708931b9414SThomas Gleixner	.long 1b - .
709931b9414SThomas Gleixner	.popsection
710931b9414SThomas Gleixner
711931b9414SThomas Gleixner	CALL_NOSPEC	rsi
712931b9414SThomas Gleixner
713931b9414SThomas Gleixner2:
714931b9414SThomas Gleixner	.pushsection .discard.instr_end
715931b9414SThomas Gleixner	.long 2b - .
716931b9414SThomas Gleixner	.popsection
717931b9414SThomas Gleixner
718931b9414SThomas Gleixner	/* Restore the previous stack pointer from RBP. */
719931b9414SThomas Gleixner	leaveq
720931b9414SThomas Gleixner	ret
721931b9414SThomas GleixnerSYM_FUNC_END(asm_call_on_stack)
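
/*
 * Illustrative (hypothetical) caller sketch for the convention documented
 * above: %rdi holds the topmost free word of the target stack, %rsi the
 * function, %rdx its single argument.  my_stack_top and some_c_handler
 * below are placeholder names:
 *
 *	leaq	my_stack_top(%rip), %rdi
 *	leaq	some_c_handler(%rip), %rsi
 *	movq	%rbx, %rdx
 *	call	asm_call_on_stack
 *
 * The old %rsp is stored at (%rdi) before the switch, which is the
 * back-link the ORC unwinder uses to walk across the stack switch.
 */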
722931b9414SThomas Gleixner
72328c11b0fSJuergen Gross#ifdef CONFIG_XEN_PV
724905a36a2SIngo Molnar/*
725905a36a2SIngo Molnar * A note on the "critical region" in our callback handler.
726905a36a2SIngo Molnar * We want to avoid stacking callback handlers due to events occurring
727905a36a2SIngo Molnar * during handling of the last event. To do this, we keep events disabled
728905a36a2SIngo Molnar * until we've done all processing. HOWEVER, we must enable events before
729905a36a2SIngo Molnar * popping the stack frame (can't be done atomically) and so it would still
730905a36a2SIngo Molnar * be possible to get enough handler activations to overflow the stack.
731905a36a2SIngo Molnar * Although unlikely, bugs of that kind are hard to track down, so we'd
732905a36a2SIngo Molnar * like to avoid the possibility.
733905a36a2SIngo Molnar * So, on entry to the handler we detect whether we interrupted an
734905a36a2SIngo Molnar * existing activation in its critical region -- if so, we pop the current
735905a36a2SIngo Molnar * activation and restart the handler using the previous one.
7362f6474e4SThomas Gleixner *
7372f6474e4SThomas Gleixner * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
738905a36a2SIngo Molnar */
7392f6474e4SThomas GleixnerSYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
7404d732138SIngo Molnar
741905a36a2SIngo Molnar/*
742905a36a2SIngo Molnar * Since we don't modify %rdi, xen_pv_evtchn_do_upcall(struct pt_regs *)
743905a36a2SIngo Molnar * will see the correct pointer to the pt_regs
744905a36a2SIngo Molnar */
7458c1f7558SJosh Poimboeuf	UNWIND_HINT_FUNC
7464d732138SIngo Molnar	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
7478c1f7558SJosh Poimboeuf	UNWIND_HINT_REGS
7481d3e53e8SAndy Lutomirski
7492f6474e4SThomas Gleixner	call	xen_pv_evtchn_do_upcall
7501d3e53e8SAndy Lutomirski
7512f6474e4SThomas Gleixner	jmp	error_return
7522f6474e4SThomas GleixnerSYM_CODE_END(exc_xen_hypervisor_callback)
753905a36a2SIngo Molnar
754905a36a2SIngo Molnar/*
755905a36a2SIngo Molnar * Hypervisor uses this for application faults while it executes.
756905a36a2SIngo Molnar * We get here for two reasons:
757905a36a2SIngo Molnar *  1. Fault while reloading DS, ES, FS or GS
758905a36a2SIngo Molnar *  2. Fault while executing IRET
759905a36a2SIngo Molnar * Category 1 we do not need to fix up as Xen has already reloaded all segment
760905a36a2SIngo Molnar * registers that could be reloaded and zeroed the others.
761905a36a2SIngo Molnar * Category 2 we fix up by killing the current process. We cannot use the
762905a36a2SIngo Molnar * normal Linux return path in this case because if we use the IRET hypercall
763905a36a2SIngo Molnar * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
764905a36a2SIngo Molnar * We distinguish between categories by comparing each saved segment register
765905a36a2SIngo Molnar * with its current contents: any discrepancy means we are in category 1.
766905a36a2SIngo Molnar */
767bc7b11c0SJiri SlabySYM_CODE_START(xen_failsafe_callback)
7688c1f7558SJosh Poimboeuf	UNWIND_HINT_EMPTY
769905a36a2SIngo Molnar	movl	%ds, %ecx
770905a36a2SIngo Molnar	cmpw	%cx, 0x10(%rsp)
771905a36a2SIngo Molnar	jne	1f
772905a36a2SIngo Molnar	movl	%es, %ecx
773905a36a2SIngo Molnar	cmpw	%cx, 0x18(%rsp)
774905a36a2SIngo Molnar	jne	1f
775905a36a2SIngo Molnar	movl	%fs, %ecx
776905a36a2SIngo Molnar	cmpw	%cx, 0x20(%rsp)
777905a36a2SIngo Molnar	jne	1f
778905a36a2SIngo Molnar	movl	%gs, %ecx
779905a36a2SIngo Molnar	cmpw	%cx, 0x28(%rsp)
780905a36a2SIngo Molnar	jne	1f
781905a36a2SIngo Molnar	/* All segments match their saved values => Category 2 (Bad IRET). */
782905a36a2SIngo Molnar	movq	(%rsp), %rcx
783905a36a2SIngo Molnar	movq	8(%rsp), %r11
784905a36a2SIngo Molnar	addq	$0x30, %rsp
785905a36a2SIngo Molnar	pushq	$0				/* RIP */
7868c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS offset=8
787be4c11afSThomas Gleixner	jmp	asm_exc_general_protection
788905a36a2SIngo Molnar1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
789905a36a2SIngo Molnar	movq	(%rsp), %rcx
790905a36a2SIngo Molnar	movq	8(%rsp), %r11
791905a36a2SIngo Molnar	addq	$0x30, %rsp
7928c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS
793905a36a2SIngo Molnar	pushq	$-1 /* orig_ax = -1 => not a system call */
7943f01daecSDominik Brodowski	PUSH_AND_CLEAR_REGS
795946c1911SJosh Poimboeuf	ENCODE_FRAME_POINTER
796e88d9741SThomas Gleixner	jmp	error_return
797bc7b11c0SJiri SlabySYM_CODE_END(xen_failsafe_callback)
79828c11b0fSJuergen Gross#endif /* CONFIG_XEN_PV */
799905a36a2SIngo Molnar
800905a36a2SIngo Molnar/*
8019e809d15SDominik Brodowski * Save all registers in pt_regs, and switch gs if needed.
802905a36a2SIngo Molnar * Use a slow but surefire "are we in kernel?" check.
803905a36a2SIngo Molnar * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
804905a36a2SIngo Molnar */
805ef1e0315SJiri SlabySYM_CODE_START_LOCAL(paranoid_entry)
8068c1f7558SJosh Poimboeuf	UNWIND_HINT_FUNC
807905a36a2SIngo Molnar	cld
8089e809d15SDominik Brodowski	PUSH_AND_CLEAR_REGS save_ret=1
8099e809d15SDominik Brodowski	ENCODE_FRAME_POINTER 8
810905a36a2SIngo Molnar	movl	$1, %ebx
811905a36a2SIngo Molnar	movl	$MSR_GS_BASE, %ecx
812905a36a2SIngo Molnar	rdmsr
813905a36a2SIngo Molnar	testl	%edx, %edx
814905a36a2SIngo Molnar	js	1f				/* negative -> in kernel */
815905a36a2SIngo Molnar	SWAPGS
816905a36a2SIngo Molnar	xorl	%ebx, %ebx
8178a09317bSDave Hansen
8188a09317bSDave Hansen1:
81916561f27SDave Hansen	/*
82016561f27SDave Hansen	 * Always stash CR3 in %r14.  This value will be restored,
821ae852495SAndy Lutomirski	 * verbatim, at exit.  Needed if paranoid_entry interrupted
822ae852495SAndy Lutomirski	 * another entry that already switched to the user CR3 value
823ae852495SAndy Lutomirski	 * but has not yet returned to userspace.
82416561f27SDave Hansen	 *
82516561f27SDave Hansen	 * This is also why CS (stashed in the "iret frame" by the
82616561f27SDave Hansen	 * hardware at entry) cannot be used: this may be a return
827ae852495SAndy Lutomirski	 * to kernel code, but with a user CR3 value.
82816561f27SDave Hansen	 */
8298a09317bSDave Hansen	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
8308a09317bSDave Hansen
83118ec54fdSJosh Poimboeuf	/*
83218ec54fdSJosh Poimboeuf	 * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
83318ec54fdSJosh Poimboeuf	 * unconditional CR3 write, even in the PTI case.  So do an lfence
83418ec54fdSJosh Poimboeuf	 * to prevent GS speculation, regardless of whether PTI is enabled.
83518ec54fdSJosh Poimboeuf	 */
83618ec54fdSJosh Poimboeuf	FENCE_SWAPGS_KERNEL_ENTRY
83718ec54fdSJosh Poimboeuf
8388a09317bSDave Hansen	ret
839ef1e0315SJiri SlabySYM_CODE_END(paranoid_entry)
840905a36a2SIngo Molnar
841905a36a2SIngo Molnar/*
842905a36a2SIngo Molnar * "Paranoid" exit path from exception stack.  This is invoked
843905a36a2SIngo Molnar * only on return from non-NMI IST interrupts that came
844905a36a2SIngo Molnar * from kernel space.
845905a36a2SIngo Molnar *
846905a36a2SIngo Molnar * We may be returning to very strange contexts (e.g. very early
847905a36a2SIngo Molnar * in syscall entry), so checking for preemption here would
848905a36a2SIngo Molnar * be complicated.  Fortunately, there's no good reason
849905a36a2SIngo Molnar * to try to handle preemption here.
8504d732138SIngo Molnar *
8514d732138SIngo Molnar * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
852905a36a2SIngo Molnar */
853ef1e0315SJiri SlabySYM_CODE_START_LOCAL(paranoid_exit)
8548c1f7558SJosh Poimboeuf	UNWIND_HINT_REGS
855905a36a2SIngo Molnar	testl	%ebx, %ebx			/* swapgs needed? */
856e5317832SAndy Lutomirski	jnz	.Lparanoid_exit_no_swapgs
85716561f27SDave Hansen	/* Always restore stashed CR3 value (see paranoid_entry) */
85821e94459SPeter Zijlstra	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
859905a36a2SIngo Molnar	SWAPGS_UNSAFE_STACK
86045c08383SThomas Gleixner	jmp	restore_regs_and_return_to_kernel
861e5317832SAndy Lutomirski.Lparanoid_exit_no_swapgs:
86216561f27SDave Hansen	/* Always restore stashed CR3 value (see paranoid_entry) */
863e4865757SIngo Molnar	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
864e5317832SAndy Lutomirski	jmp restore_regs_and_return_to_kernel
865ef1e0315SJiri SlabySYM_CODE_END(paranoid_exit)
866905a36a2SIngo Molnar
867905a36a2SIngo Molnar/*
8689e809d15SDominik Brodowski * Save all registers in pt_regs, and switch GS if needed.
869905a36a2SIngo Molnar */
870ef1e0315SJiri SlabySYM_CODE_START_LOCAL(error_entry)
8719e809d15SDominik Brodowski	UNWIND_HINT_FUNC
872905a36a2SIngo Molnar	cld
8739e809d15SDominik Brodowski	PUSH_AND_CLEAR_REGS save_ret=1
8749e809d15SDominik Brodowski	ENCODE_FRAME_POINTER 8
875905a36a2SIngo Molnar	testb	$3, CS+8(%rsp)
876cb6f64edSAndy Lutomirski	jz	.Lerror_kernelspace
877539f5113SAndy Lutomirski
878cb6f64edSAndy Lutomirski	/*
879cb6f64edSAndy Lutomirski	 * We entered from user mode or we're pretending to have entered
880cb6f64edSAndy Lutomirski	 * from user mode due to an IRET fault.
881cb6f64edSAndy Lutomirski	 */
882905a36a2SIngo Molnar	SWAPGS
88318ec54fdSJosh Poimboeuf	FENCE_SWAPGS_USER_ENTRY
8848a09317bSDave Hansen	/* We have user CR3.  Change to kernel CR3. */
8858a09317bSDave Hansen	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
886539f5113SAndy Lutomirski
887cb6f64edSAndy Lutomirski.Lerror_entry_from_usermode_after_swapgs:
8887f2590a1SAndy Lutomirski	/* Put us onto the real thread stack. */
8897f2590a1SAndy Lutomirski	popq	%r12				/* save return addr in %r12 */
8907f2590a1SAndy Lutomirski	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
8917f2590a1SAndy Lutomirski	call	sync_regs
8927f2590a1SAndy Lutomirski	movq	%rax, %rsp			/* switch stack */
8937f2590a1SAndy Lutomirski	ENCODE_FRAME_POINTER
8947f2590a1SAndy Lutomirski	pushq	%r12
895f1075053SAndy Lutomirski	ret
89602bc7768SAndy Lutomirski
89718ec54fdSJosh Poimboeuf.Lerror_entry_done_lfence:
89818ec54fdSJosh Poimboeuf	FENCE_SWAPGS_KERNEL_ENTRY
899cb6f64edSAndy Lutomirski.Lerror_entry_done:
900905a36a2SIngo Molnar	ret
901905a36a2SIngo Molnar
902905a36a2SIngo Molnar	/*
903905a36a2SIngo Molnar	 * There are two places in the kernel that can potentially fault with
904905a36a2SIngo Molnar	 * usergs. Handle them here.  B stepping K8s sometimes report a
905905a36a2SIngo Molnar	 * truncated RIP for IRET exceptions returning to compat mode. Check
906905a36a2SIngo Molnar	 * for these here too.
907905a36a2SIngo Molnar	 */
908cb6f64edSAndy Lutomirski.Lerror_kernelspace:
909905a36a2SIngo Molnar	leaq	native_irq_return_iret(%rip), %rcx
910905a36a2SIngo Molnar	cmpq	%rcx, RIP+8(%rsp)
911cb6f64edSAndy Lutomirski	je	.Lerror_bad_iret
912905a36a2SIngo Molnar	movl	%ecx, %eax			/* zero extend */
913905a36a2SIngo Molnar	cmpq	%rax, RIP+8(%rsp)
914cb6f64edSAndy Lutomirski	je	.Lbstep_iret
91542c748bbSBorislav Petkov	cmpq	$.Lgs_change, RIP+8(%rsp)
91618ec54fdSJosh Poimboeuf	jne	.Lerror_entry_done_lfence
917539f5113SAndy Lutomirski
918539f5113SAndy Lutomirski	/*
91942c748bbSBorislav Petkov	 * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
920539f5113SAndy Lutomirski	 * gsbase and proceed.  We'll fix up the exception and land in
92142c748bbSBorislav Petkov	 * .Lgs_change's error handler with kernel gsbase.
922539f5113SAndy Lutomirski	 */
9232fa5f04fSWanpeng Li	SWAPGS
92418ec54fdSJosh Poimboeuf	FENCE_SWAPGS_USER_ENTRY
9252fa5f04fSWanpeng Li	jmp .Lerror_entry_done
926905a36a2SIngo Molnar
927cb6f64edSAndy Lutomirski.Lbstep_iret:
928905a36a2SIngo Molnar	/* Fix truncated RIP */
929905a36a2SIngo Molnar	movq	%rcx, RIP+8(%rsp)
930905a36a2SIngo Molnar	/* fall through */
931905a36a2SIngo Molnar
932cb6f64edSAndy Lutomirski.Lerror_bad_iret:
933539f5113SAndy Lutomirski	/*
9348a09317bSDave Hansen	 * We came from an IRET to user mode, so we have user
9358a09317bSDave Hansen	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
936539f5113SAndy Lutomirski	 */
937905a36a2SIngo Molnar	SWAPGS
93818ec54fdSJosh Poimboeuf	FENCE_SWAPGS_USER_ENTRY
9398a09317bSDave Hansen	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
940539f5113SAndy Lutomirski
941539f5113SAndy Lutomirski	/*
942539f5113SAndy Lutomirski	 * Pretend that the exception came from user mode: set up pt_regs
943b3681dd5SAndy Lutomirski	 * as if we faulted immediately after IRET.
944539f5113SAndy Lutomirski	 */
945905a36a2SIngo Molnar	mov	%rsp, %rdi
946905a36a2SIngo Molnar	call	fixup_bad_iret
947905a36a2SIngo Molnar	mov	%rax, %rsp
948cb6f64edSAndy Lutomirski	jmp	.Lerror_entry_from_usermode_after_swapgs
949ef1e0315SJiri SlabySYM_CODE_END(error_entry)
950905a36a2SIngo Molnar
951424c7d0aSThomas GleixnerSYM_CODE_START_LOCAL(error_return)
952424c7d0aSThomas Gleixner	UNWIND_HINT_REGS
953424c7d0aSThomas Gleixner	DEBUG_ENTRY_ASSERT_IRQS_OFF
954424c7d0aSThomas Gleixner	testb	$3, CS(%rsp)
955424c7d0aSThomas Gleixner	jz	restore_regs_and_return_to_kernel
956424c7d0aSThomas Gleixner	jmp	swapgs_restore_regs_and_return_to_usermode
957424c7d0aSThomas GleixnerSYM_CODE_END(error_return)
958424c7d0aSThomas Gleixner
959929bacecSAndy Lutomirski/*
960929bacecSAndy Lutomirski * Runs on exception stack.  Xen PV does not go through this path at all,
961929bacecSAndy Lutomirski * so we can use real assembly here.
9628a09317bSDave Hansen *
9638a09317bSDave Hansen * Registers:
9648a09317bSDave Hansen *	%r14: Used to save/restore the CR3 of the interrupted context
9658a09317bSDave Hansen *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
966929bacecSAndy Lutomirski */
9676271fef0SThomas GleixnerSYM_CODE_START(asm_exc_nmi)
9688c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS
969929bacecSAndy Lutomirski
970fc57a7c6SAndy Lutomirski	/*
971905a36a2SIngo Molnar	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
972905a36a2SIngo Molnar	 * the iretq it performs will take us out of NMI context.
973905a36a2SIngo Molnar	 * This means that we can have nested NMIs where the next
974905a36a2SIngo Molnar	 * NMI is using the top of the stack of the previous NMI. We
975905a36a2SIngo Molnar	 * can't let it execute because the nested NMI will corrupt the
976905a36a2SIngo Molnar	 * stack of the previous NMI. NMI handlers are not re-entrant
977905a36a2SIngo Molnar	 * anyway.
978905a36a2SIngo Molnar	 *
979905a36a2SIngo Molnar	 * To handle this case we do the following:
980905a36a2SIngo Molnar	 *  Check a special location on the stack that contains
981905a36a2SIngo Molnar	 *  a variable that is set when NMIs are executing.
982905a36a2SIngo Molnar	 *  The interrupted task's stack is also checked to see if it
983905a36a2SIngo Molnar	 *  is an NMI stack.
984905a36a2SIngo Molnar	 *  If the variable is not set and the stack is not the NMI
985905a36a2SIngo Molnar	 *  stack then:
986905a36a2SIngo Molnar	 *    o Set the special variable on the stack
9870b22930eSAndy Lutomirski	 *    o Copy the interrupt frame into an "outermost" location on the
9880b22930eSAndy Lutomirski	 *      stack
9890b22930eSAndy Lutomirski	 *    o Copy the interrupt frame into an "iret" location on the stack
990905a36a2SIngo Molnar	 *    o Continue processing the NMI
991905a36a2SIngo Molnar	 *  If the variable is set or the previous stack is the NMI stack:
9920b22930eSAndy Lutomirski	 *    o Modify the "iret" location to jump to the repeat_nmi
993905a36a2SIngo Molnar	 *    o return back to the first NMI
994905a36a2SIngo Molnar	 *
995905a36a2SIngo Molnar	 * Now on exit of the first NMI, we first clear the stack variable
996905a36a2SIngo Molnar	 * The NMI stack will tell any nested NMIs at that point that it is
997905a36a2SIngo Molnar	 * nested. Then we pop the stack normally with iret, and if there was
998905a36a2SIngo Molnar	 * a nested NMI that updated the copy interrupt stack frame, a
999905a36a2SIngo Molnar	 * jump will be made to the repeat_nmi code that will handle the second
1000905a36a2SIngo Molnar	 * NMI.
10019b6e6a83SAndy Lutomirski	 *
10029b6e6a83SAndy Lutomirski	 * However, espfix prevents us from directly returning to userspace
10039b6e6a83SAndy Lutomirski	 * with a single IRET instruction.  Similarly, IRET to user mode
10049b6e6a83SAndy Lutomirski	 * can fault.  We therefore handle NMIs from user space like
10059b6e6a83SAndy Lutomirski	 * other IST entries.
1006905a36a2SIngo Molnar	 */
1007905a36a2SIngo Molnar
1008e93c1730SAndy Lutomirski	ASM_CLAC
1009e93c1730SAndy Lutomirski
1010905a36a2SIngo Molnar	/* Use %rdx as our temp variable throughout */
1011905a36a2SIngo Molnar	pushq	%rdx
1012905a36a2SIngo Molnar
10139b6e6a83SAndy Lutomirski	testb	$3, CS-RIP+8(%rsp)
10149b6e6a83SAndy Lutomirski	jz	.Lnmi_from_kernel
1015905a36a2SIngo Molnar
1016905a36a2SIngo Molnar	/*
10179b6e6a83SAndy Lutomirski	 * NMI from user mode.  We need to run on the thread stack, but we
10189b6e6a83SAndy Lutomirski	 * can't go through the normal entry paths: NMIs are masked, and
10199b6e6a83SAndy Lutomirski	 * we don't want to enable interrupts, because then we'll end
10209b6e6a83SAndy Lutomirski	 * up in an awkward situation in which IRQs are on but NMIs
10219b6e6a83SAndy Lutomirski	 * are off.
102283c133cfSAndy Lutomirski	 *
102383c133cfSAndy Lutomirski	 * We also must not push anything to the stack before switching
102483c133cfSAndy Lutomirski	 * stacks lest we corrupt the "NMI executing" variable.
10259b6e6a83SAndy Lutomirski	 */
10269b6e6a83SAndy Lutomirski
1027929bacecSAndy Lutomirski	swapgs
10289b6e6a83SAndy Lutomirski	cld
102918ec54fdSJosh Poimboeuf	FENCE_SWAPGS_USER_ENTRY
10308a09317bSDave Hansen	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
10319b6e6a83SAndy Lutomirski	movq	%rsp, %rdx
10329b6e6a83SAndy Lutomirski	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
10338c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS base=%rdx offset=8
10349b6e6a83SAndy Lutomirski	pushq	5*8(%rdx)	/* pt_regs->ss */
10359b6e6a83SAndy Lutomirski	pushq	4*8(%rdx)	/* pt_regs->rsp */
10369b6e6a83SAndy Lutomirski	pushq	3*8(%rdx)	/* pt_regs->flags */
10379b6e6a83SAndy Lutomirski	pushq	2*8(%rdx)	/* pt_regs->cs */
10389b6e6a83SAndy Lutomirski	pushq	1*8(%rdx)	/* pt_regs->rip */
10398c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS
10409b6e6a83SAndy Lutomirski	pushq   $-1		/* pt_regs->orig_ax */
104130907fd1SDominik Brodowski	PUSH_AND_CLEAR_REGS rdx=(%rdx)
1042946c1911SJosh Poimboeuf	ENCODE_FRAME_POINTER
10439b6e6a83SAndy Lutomirski
10449b6e6a83SAndy Lutomirski	/*
10459b6e6a83SAndy Lutomirski	 * At this point we no longer need to worry about stack damage
10469b6e6a83SAndy Lutomirski	 * due to nesting -- we're on the normal thread stack and we're
10479b6e6a83SAndy Lutomirski	 * done with the NMI stack.
10489b6e6a83SAndy Lutomirski	 */
10499b6e6a83SAndy Lutomirski
10509b6e6a83SAndy Lutomirski	movq	%rsp, %rdi
10519b6e6a83SAndy Lutomirski	movq	$-1, %rsi
10526271fef0SThomas Gleixner	call	exc_nmi
10539b6e6a83SAndy Lutomirski
10549b6e6a83SAndy Lutomirski	/*
10559b6e6a83SAndy Lutomirski	 * Return back to user mode.  We must *not* do the normal exit
1056946c1911SJosh Poimboeuf	 * work, because we don't want to enable interrupts.
10579b6e6a83SAndy Lutomirski	 */
10588a055d7fSAndy Lutomirski	jmp	swapgs_restore_regs_and_return_to_usermode
10599b6e6a83SAndy Lutomirski
10609b6e6a83SAndy Lutomirski.Lnmi_from_kernel:
10619b6e6a83SAndy Lutomirski	/*
10620b22930eSAndy Lutomirski	 * Here's what our stack frame will look like:
10630b22930eSAndy Lutomirski	 * +---------------------------------------------------------+
10640b22930eSAndy Lutomirski	 * | original SS                                             |
10650b22930eSAndy Lutomirski	 * | original Return RSP                                     |
10660b22930eSAndy Lutomirski	 * | original RFLAGS                                         |
10670b22930eSAndy Lutomirski	 * | original CS                                             |
10680b22930eSAndy Lutomirski	 * | original RIP                                            |
10690b22930eSAndy Lutomirski	 * +---------------------------------------------------------+
10700b22930eSAndy Lutomirski	 * | temp storage for rdx                                    |
10710b22930eSAndy Lutomirski	 * +---------------------------------------------------------+
10720b22930eSAndy Lutomirski	 * | "NMI executing" variable                                |
10730b22930eSAndy Lutomirski	 * +---------------------------------------------------------+
10740b22930eSAndy Lutomirski	 * | iret SS          } Copied from "outermost" frame        |
10750b22930eSAndy Lutomirski	 * | iret Return RSP  } on each loop iteration; overwritten  |
10760b22930eSAndy Lutomirski	 * | iret RFLAGS      } by a nested NMI to force another     |
10770b22930eSAndy Lutomirski	 * | iret CS          } iteration if needed.                 |
10780b22930eSAndy Lutomirski	 * | iret RIP         }                                      |
10790b22930eSAndy Lutomirski	 * +---------------------------------------------------------+
10800b22930eSAndy Lutomirski	 * | outermost SS          } initialized in first_nmi;       |
10810b22930eSAndy Lutomirski	 * | outermost Return RSP  } will not be changed before      |
10820b22930eSAndy Lutomirski	 * | outermost RFLAGS      } NMI processing is done.         |
10830b22930eSAndy Lutomirski	 * | outermost CS          } Copied to "iret" frame on each  |
10840b22930eSAndy Lutomirski	 * | outermost RIP         } iteration.                      |
10850b22930eSAndy Lutomirski	 * +---------------------------------------------------------+
10860b22930eSAndy Lutomirski	 * | pt_regs                                                 |
10870b22930eSAndy Lutomirski	 * +---------------------------------------------------------+
10880b22930eSAndy Lutomirski	 *
10890b22930eSAndy Lutomirski	 * The "original" frame is used by hardware.  Before re-enabling
10900b22930eSAndy Lutomirski	 * NMIs, we need to be done with it, and we need to leave enough
10910b22930eSAndy Lutomirski	 * space for the asm code here.
10920b22930eSAndy Lutomirski	 *
10930b22930eSAndy Lutomirski	 * We return by executing IRET while RSP points to the "iret" frame.
10940b22930eSAndy Lutomirski	 * That will either return for real or it will loop back into NMI
10950b22930eSAndy Lutomirski	 * processing.
10960b22930eSAndy Lutomirski	 *
10970b22930eSAndy Lutomirski	 * The "outermost" frame is copied to the "iret" frame on each
10980b22930eSAndy Lutomirski	 * iteration of the loop, so each iteration starts with the "iret"
10990b22930eSAndy Lutomirski	 * frame pointing to the final return target.
11000b22930eSAndy Lutomirski	 */
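
	/*
	 * The same layout as a C struct, lowest address first.  This is an
	 * illustration only; no such struct exists in the kernel:
	 *
	 *	struct nmi_kernel_stack_sketch {
	 *		struct pt_regs regs;		// from PUSH_AND_CLEAR_REGS
	 *		unsigned long outermost[5];	// RIP, CS, RFLAGS, RSP, SS
	 *		unsigned long iret[5];		// rewritten by nested NMIs
	 *		unsigned long nmi_executing;	// "NMI executing" variable
	 *		unsigned long saved_rdx;	// temp storage for rdx
	 *		unsigned long original[5];	// hardware frame, highest
	 *	};
	 */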
11010b22930eSAndy Lutomirski
11020b22930eSAndy Lutomirski	/*
11030b22930eSAndy Lutomirski	 * Determine whether we're a nested NMI.
11040b22930eSAndy Lutomirski	 *
1105a27507caSAndy Lutomirski	 * If we interrupted kernel code between repeat_nmi and
1106a27507caSAndy Lutomirski	 * end_repeat_nmi, then we are a nested NMI.  We must not
1107a27507caSAndy Lutomirski	 * modify the "iret" frame because it's being written by
1108a27507caSAndy Lutomirski	 * the outer NMI.  That's okay; the outer NMI handler is
11096271fef0SThomas Gleixner	 * about to call exc_nmi() anyway, so we can just
1110a27507caSAndy Lutomirski	 * resume the outer NMI.
1111a27507caSAndy Lutomirski	 */
1112a27507caSAndy Lutomirski
1113a27507caSAndy Lutomirski	movq	$repeat_nmi, %rdx
1114a27507caSAndy Lutomirski	cmpq	8(%rsp), %rdx
1115a27507caSAndy Lutomirski	ja	1f
1116a27507caSAndy Lutomirski	movq	$end_repeat_nmi, %rdx
1117a27507caSAndy Lutomirski	cmpq	8(%rsp), %rdx
1118a27507caSAndy Lutomirski	ja	nested_nmi_out
1119a27507caSAndy Lutomirski1:
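
	/*
	 * Illustrative C equivalent of the range check above, with 'rip'
	 * standing for the interrupted RIP at 8(%rsp):
	 *
	 *	if (rip >= (unsigned long)repeat_nmi &&
	 *	    rip <  (unsigned long)end_repeat_nmi)
	 *		goto nested_nmi_out;	// resume the outer NMI
	 */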
1120a27507caSAndy Lutomirski
1121a27507caSAndy Lutomirski	/*
1122a27507caSAndy Lutomirski	 * Now check "NMI executing".  If it's set, then we're nested.
11230b22930eSAndy Lutomirski	 * This will not detect if we interrupted an outer NMI just
11240b22930eSAndy Lutomirski	 * before IRET.
1125905a36a2SIngo Molnar	 */
1126905a36a2SIngo Molnar	cmpl	$1, -8(%rsp)
1127905a36a2SIngo Molnar	je	nested_nmi
1128905a36a2SIngo Molnar
1129905a36a2SIngo Molnar	/*
11300b22930eSAndy Lutomirski	 * Now test if the previous stack was an NMI stack.  This covers
11310b22930eSAndy Lutomirski	 * the case where we interrupt an outer NMI after it clears
1132810bc075SAndy Lutomirski	 * "NMI executing" but before IRET.  We need to be careful, though:
1133810bc075SAndy Lutomirski	 * there is one case in which RSP could point to the NMI stack
1134810bc075SAndy Lutomirski	 * despite there being no NMI active: naughty userspace controls
1135810bc075SAndy Lutomirski	 * RSP at the very beginning of the SYSCALL targets.  We can
1136810bc075SAndy Lutomirski	 * pull a fast one on naughty userspace, though: we program
1137810bc075SAndy Lutomirski	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
1138810bc075SAndy Lutomirski	 * if it controls the kernel's RSP.  We set DF before we clear
1139810bc075SAndy Lutomirski	 * "NMI executing".
1140905a36a2SIngo Molnar	 */
1141905a36a2SIngo Molnar	lea	6*8(%rsp), %rdx
1142905a36a2SIngo Molnar	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
1143905a36a2SIngo Molnar	cmpq	%rdx, 4*8(%rsp)
1144905a36a2SIngo Molnar	/* If the stack pointer is above the NMI stack, this is a normal NMI */
1145905a36a2SIngo Molnar	ja	first_nmi
11464d732138SIngo Molnar
1147905a36a2SIngo Molnar	subq	$EXCEPTION_STKSZ, %rdx
1148905a36a2SIngo Molnar	cmpq	%rdx, 4*8(%rsp)
1149905a36a2SIngo Molnar	/* If it is below the NMI stack, it is a normal NMI */
1150905a36a2SIngo Molnar	jb	first_nmi
1151810bc075SAndy Lutomirski
1152810bc075SAndy Lutomirski	/* Ah, it is within the NMI stack. */
1153810bc075SAndy Lutomirski
1154810bc075SAndy Lutomirski	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
1155810bc075SAndy Lutomirski	jz	first_nmi	/* RSP was user controlled. */
1156810bc075SAndy Lutomirski
1157810bc075SAndy Lutomirski	/* This is a nested NMI. */
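
	/*
	 * As a hedged C sketch, the three checks above amount to
	 * ('prev_rsp' and 'prev_rflags' stand for the interrupted RSP and
	 * RFLAGS in the hardware frame; 'top' is the address just above it):
	 *
	 *	if (prev_rsp > top || prev_rsp < top - EXCEPTION_STKSZ)
	 *		goto first_nmi;		// not on the NMI stack
	 *	if (!(prev_rflags & X86_EFLAGS_DF))
	 *		goto first_nmi;		// RSP was user controlled
	 *	// otherwise fall through to nested_nmi below
	 */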
1158905a36a2SIngo Molnar
1159905a36a2SIngo Molnarnested_nmi:
1160905a36a2SIngo Molnar	/*
11610b22930eSAndy Lutomirski	 * Modify the "iret" frame to point to repeat_nmi, forcing another
11620b22930eSAndy Lutomirski	 * iteration of NMI handling.
1163905a36a2SIngo Molnar	 */
116423a781e9SAndy Lutomirski	subq	$8, %rsp
1165905a36a2SIngo Molnar	leaq	-10*8(%rsp), %rdx
1166905a36a2SIngo Molnar	pushq	$__KERNEL_DS
1167905a36a2SIngo Molnar	pushq	%rdx
1168905a36a2SIngo Molnar	pushfq
1169905a36a2SIngo Molnar	pushq	$__KERNEL_CS
1170905a36a2SIngo Molnar	pushq	$repeat_nmi
1171905a36a2SIngo Molnar
1172905a36a2SIngo Molnar	/* Put stack back */
1173905a36a2SIngo Molnar	addq	$(6*8), %rsp
1174905a36a2SIngo Molnar
1175905a36a2SIngo Molnarnested_nmi_out:
1176905a36a2SIngo Molnar	popq	%rdx
1177905a36a2SIngo Molnar
11780b22930eSAndy Lutomirski	/* We are returning to kernel mode, so this cannot result in a fault. */
1179929bacecSAndy Lutomirski	iretq
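
	/*
	 * What nested_nmi did, as an illustrative sketch (field names are
	 * from the layout sketch above, not kernel symbols):
	 *
	 *	iret->ss     = __KERNEL_DS;
	 *	iret->rsp    = target_rsp;	// from the leaq -10*8(%rsp)
	 *	iret->rflags = current_rflags;	// from pushfq
	 *	iret->cs     = __KERNEL_CS;
	 *	iret->rip    = (unsigned long)repeat_nmi;
	 *
	 * so the outer NMI's eventual IRET lands on repeat_nmi and runs one
	 * more iteration instead of returning to the interrupted context.
	 */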
1180905a36a2SIngo Molnar
1181905a36a2SIngo Molnarfirst_nmi:
11820b22930eSAndy Lutomirski	/* Restore rdx. */
1183905a36a2SIngo Molnar	movq	(%rsp), %rdx
1184905a36a2SIngo Molnar
118536f1a77bSAndy Lutomirski	/* Make room for "NMI executing". */
118636f1a77bSAndy Lutomirski	pushq	$0
1187905a36a2SIngo Molnar
11880b22930eSAndy Lutomirski	/* Leave room for the "iret" frame */
1189905a36a2SIngo Molnar	subq	$(5*8), %rsp
1190905a36a2SIngo Molnar
11910b22930eSAndy Lutomirski	/* Copy the "original" frame to the "outermost" frame */
1192905a36a2SIngo Molnar	.rept 5
1193905a36a2SIngo Molnar	pushq	11*8(%rsp)
1194905a36a2SIngo Molnar	.endr
11958c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS
1196905a36a2SIngo Molnar
1197905a36a2SIngo Molnar	/* Everything up to here is safe from nested NMIs */
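
	/*
	 * Rough C picture of the first_nmi setup above, reusing the field
	 * names from the layout sketch (illustration only):
	 *
	 *	frame->nmi_executing = 0;		// pushq $0
	 *	// reserve the "iret" slots		// subq $(5*8), %rsp
	 *	memcpy(frame->outermost, frame->original,
	 *	       5 * sizeof(unsigned long));	// .rept 5 / pushq loop
	 */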
1198905a36a2SIngo Molnar
1199a97439aaSAndy Lutomirski#ifdef CONFIG_DEBUG_ENTRY
1200a97439aaSAndy Lutomirski	/*
1201a97439aaSAndy Lutomirski	 * For ease of testing, unmask NMIs right away.  Disabled by
1202a97439aaSAndy Lutomirski	 * default because IRET is very expensive.
1203a97439aaSAndy Lutomirski	 */
1204a97439aaSAndy Lutomirski	pushq	$0		/* SS */
1205a97439aaSAndy Lutomirski	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
1206a97439aaSAndy Lutomirski	addq	$8, (%rsp)	/* Fix up RSP */
1207a97439aaSAndy Lutomirski	pushfq			/* RFLAGS */
1208a97439aaSAndy Lutomirski	pushq	$__KERNEL_CS	/* CS */
1209a97439aaSAndy Lutomirski	pushq	$1f		/* RIP */
1210929bacecSAndy Lutomirski	iretq			/* continues at repeat_nmi below */
12118c1f7558SJosh Poimboeuf	UNWIND_HINT_IRET_REGS
1212a97439aaSAndy Lutomirski1:
1213a97439aaSAndy Lutomirski#endif
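
	/*
	 * The CONFIG_DEBUG_ENTRY block above unmasks NMIs early by taking an
	 * IRET that simply continues at the "1:" label.  As a sketch, the
	 * frame it builds is roughly:
	 *
	 *	self.ss     = 0;
	 *	self.rsp    = rsp_before_this_frame;	// the addq $8 fixup
	 *	self.rflags = current_rflags;
	 *	self.cs     = __KERNEL_CS;
	 *	self.rip    = addr_of_label_1;
	 *
	 * IRET is what re-enables NMIs, so this trades one extra, expensive
	 * IRET for easier testing of the nested-NMI paths.
	 */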
1214a97439aaSAndy Lutomirski
12150b22930eSAndy Lutomirskirepeat_nmi:
1216905a36a2SIngo Molnar	/*
1217905a36a2SIngo Molnar	 * If there was a nested NMI, the first NMI's iret will return
1218905a36a2SIngo Molnar	 * here. But NMIs are still enabled and we can take another
1219905a36a2SIngo Molnar	 * nested NMI. The nested NMI checks the interrupted RIP to see
1220905a36a2SIngo Molnar	 * if it is between repeat_nmi and end_repeat_nmi, and if so
1221905a36a2SIngo Molnar	 * it will just return, as we are about to repeat an NMI anyway.
1222905a36a2SIngo Molnar	 * This makes it safe to copy to the stack frame that a nested
1223905a36a2SIngo Molnar	 * NMI will update.
12240b22930eSAndy Lutomirski	 *
12250b22930eSAndy Lutomirski	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
12260b22930eSAndy Lutomirski	 * we're repeating an NMI, gsbase has the same value that it had on
12270b22930eSAndy Lutomirski	 * the first iteration.  paranoid_entry will load the kernel
12286271fef0SThomas Gleixner	 * gsbase if needed before we call exc_nmi().  "NMI executing"
122936f1a77bSAndy Lutomirski	 * is zero.
1230905a36a2SIngo Molnar	 */
123136f1a77bSAndy Lutomirski	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */
1232905a36a2SIngo Molnar
12330b22930eSAndy Lutomirski	/*
12340b22930eSAndy Lutomirski	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
12350b22930eSAndy Lutomirski	 * here must not modify the "iret" frame while we're writing to
12360b22930eSAndy Lutomirski	 * it or it will end up containing garbage.
12370b22930eSAndy Lutomirski	 */
1238905a36a2SIngo Molnar	addq	$(10*8), %rsp
1239905a36a2SIngo Molnar	.rept 5
1240905a36a2SIngo Molnar	pushq	-6*8(%rsp)
1241905a36a2SIngo Molnar	.endr
1242905a36a2SIngo Molnar	subq	$(5*8), %rsp
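
	/*
	 * The add/push/sub sequence above is just a copy.  As an
	 * illustrative one-liner, using the layout sketch's field names:
	 *
	 *	memcpy(frame->iret, frame->outermost, 5 * sizeof(unsigned long));
	 *
	 * done with pushes so that RSP ends up back at "outermost RIP".
	 */
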
1243905a36a2SIngo Molnarend_repeat_nmi:
1244905a36a2SIngo Molnar
1245905a36a2SIngo Molnar	/*
12460b22930eSAndy Lutomirski	 * Everything below this point can be preempted by a nested NMI.
12470b22930eSAndy Lutomirski	 * If this happens, then the inner NMI will change the "iret"
12480b22930eSAndy Lutomirski	 * frame to point back to repeat_nmi.
1249905a36a2SIngo Molnar	 */
1250905a36a2SIngo Molnar	pushq	$-1				/* ORIG_RAX: no syscall to restart */
1251905a36a2SIngo Molnar
1252905a36a2SIngo Molnar	/*
1253905a36a2SIngo Molnar	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1254905a36a2SIngo Molnar	 * as we should not be calling schedule in NMI context,
1255905a36a2SIngo Molnar	 * even with normal interrupts enabled. An NMI should not be
1256905a36a2SIngo Molnar	 * setting NEED_RESCHED or anything that normal interrupts and
1257905a36a2SIngo Molnar	 * exceptions might do.
1258905a36a2SIngo Molnar	 */
1259905a36a2SIngo Molnar	call	paranoid_entry
12608c1f7558SJosh Poimboeuf	UNWIND_HINT_REGS
1261905a36a2SIngo Molnar
1262905a36a2SIngo Molnar	movq	%rsp, %rdi
1263905a36a2SIngo Molnar	movq	$-1, %rsi
12646271fef0SThomas Gleixner	call	exc_nmi
1265905a36a2SIngo Molnar
126616561f27SDave Hansen	/* Always restore stashed CR3 value (see paranoid_entry) */
126721e94459SPeter Zijlstra	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
12688a09317bSDave Hansen
1269905a36a2SIngo Molnar	testl	%ebx, %ebx			/* swapgs needed? */
1270905a36a2SIngo Molnar	jnz	nmi_restore
1271905a36a2SIngo Molnarnmi_swapgs:
1272905a36a2SIngo Molnar	SWAPGS_UNSAFE_STACK
1273905a36a2SIngo Molnarnmi_restore:
1274502af0d7SDominik Brodowski	POP_REGS
12750b22930eSAndy Lutomirski
1276471ee483SAndy Lutomirski	/*
1277471ee483SAndy Lutomirski	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
1278471ee483SAndy Lutomirski	 * frame.
1279471ee483SAndy Lutomirski	 */
1280471ee483SAndy Lutomirski	addq	$6*8, %rsp
1281905a36a2SIngo Molnar
1282810bc075SAndy Lutomirski	/*
1283810bc075SAndy Lutomirski	 * Clear "NMI executing".  Set DF first so that we can easily
1284810bc075SAndy Lutomirski	 * distinguish the remaining code between here and IRET from
1285929bacecSAndy Lutomirski	 * the SYSCALL entry and exit paths.
1286929bacecSAndy Lutomirski	 *
1287929bacecSAndy Lutomirski	 * We arguably should just inspect RIP instead, but I (Andy) wrote
1288929bacecSAndy Lutomirski	 * this code when I had the misapprehension that Xen PV supported
1289929bacecSAndy Lutomirski	 * NMIs, and Xen PV would break that approach.
1290810bc075SAndy Lutomirski	 */
1291810bc075SAndy Lutomirski	std
1292810bc075SAndy Lutomirski	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
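
	/*
	 * In C-like pseudocode (illustrative), the two instructions above are:
	 *
	 *	set_direction_flag();		// std
	 *	frame->nmi_executing = 0;	// movq $0, 5*8(%rsp)
	 *
	 * A nested NMI arriving in this window sees DF set in our saved
	 * RFLAGS and therefore still treats this as a nested NMI rather
	 * than a user-controlled RSP.
	 */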
12930b22930eSAndy Lutomirski
12940b22930eSAndy Lutomirski	/*
1295929bacecSAndy Lutomirski	 * iretq reads the "iret" frame and exits the NMI stack in a
1296929bacecSAndy Lutomirski	 * single instruction.  We are returning to kernel mode, so this
1297929bacecSAndy Lutomirski	 * cannot result in a fault.  Similarly, we don't need to worry
1298929bacecSAndy Lutomirski	 * about espfix64 on the way back to kernel mode.
12990b22930eSAndy Lutomirski	 */
1300929bacecSAndy Lutomirski	iretq
13016271fef0SThomas GleixnerSYM_CODE_END(asm_exc_nmi)
1302905a36a2SIngo Molnar
1303dffb3f9dSAndy Lutomirski#ifndef CONFIG_IA32_EMULATION
1304dffb3f9dSAndy Lutomirski/*
1305dffb3f9dSAndy Lutomirski * This handles SYSCALL from 32-bit code.  There is no way to program
1306dffb3f9dSAndy Lutomirski * MSRs to fully disable 32-bit SYSCALL.
1307dffb3f9dSAndy Lutomirski */
1308bc7b11c0SJiri SlabySYM_CODE_START(ignore_sysret)
13098c1f7558SJosh Poimboeuf	UNWIND_HINT_EMPTY
1310905a36a2SIngo Molnar	mov	$-ENOSYS, %eax
1311b2b1d94cSJan Beulich	sysretl
1312bc7b11c0SJiri SlabySYM_CODE_END(ignore_sysret)
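
/*
 * Functionally, ignore_sysret behaves like the following sketch
 * (illustration only): a 32-bit SYSCALL lands here and is rejected.
 *
 *	long ignore_sysret(void)
 *	{
 *		return -ENOSYS;		// delivered in eax via sysretl
 *	}
 */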
1313dffb3f9dSAndy Lutomirski#endif
13142deb4be2SAndy Lutomirski
1315b9f6976bSThomas Gleixner.pushsection .text, "ax"
1316bc7b11c0SJiri SlabySYM_CODE_START(rewind_stack_do_exit)
13178c1f7558SJosh Poimboeuf	UNWIND_HINT_FUNC
13182deb4be2SAndy Lutomirski	/* Prevent any naive code from trying to unwind to our caller. */
13192deb4be2SAndy Lutomirski	xorl	%ebp, %ebp
13202deb4be2SAndy Lutomirski
13212deb4be2SAndy Lutomirski	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
13228c1f7558SJosh Poimboeuf	leaq	-PTREGS_SIZE(%rax), %rsp
1323f977df7bSJann Horn	UNWIND_HINT_REGS
13242deb4be2SAndy Lutomirski
13252deb4be2SAndy Lutomirski	call	do_exit
1326bc7b11c0SJiri SlabySYM_CODE_END(rewind_stack_do_exit)
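
/*
 * Conceptually (a hedged sketch; the signal argument is assumed to arrive
 * in %edi from the caller and is left untouched above):
 *
 *	void __noreturn rewind_stack_do_exit(int signal)
 *	{
 *		// discard the (possibly corrupted) current stack state,
 *		// point RSP at a pt_regs-sized area at the top of the
 *		// thread stack, then terminate the task.
 *		do_exit(signal);
 *	}
 */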
1327b9f6976bSThomas Gleixner.popsection
1328