xref: /openbmc/linux/arch/x86/entry/entry_64.S (revision a16be368)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 *  linux/arch/x86_64/entry.S
4 *
5 *  Copyright (C) 1991, 1992  Linus Torvalds
6 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
7 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
8 *
9 * entry.S contains the system-call and fault low-level handling routines.
10 *
11 * Some of this is documented in Documentation/x86/entry_64.rst
12 *
13 * A note on terminology:
14 * - iret frame:	Architecture defined interrupt frame from SS to RIP
15 *			at the top of the kernel process stack.
16 *
17 * Some macro usage:
18 * - SYM_FUNC_START/END:Define functions in the symbol table.
19 * - TRACE_IRQ_*:	Trace hardirq state for lock debugging.
20 * - idtentry:		Define exception entry points.
21 */
22#include <linux/linkage.h>
23#include <asm/segment.h>
24#include <asm/cache.h>
25#include <asm/errno.h>
26#include <asm/asm-offsets.h>
27#include <asm/msr.h>
28#include <asm/unistd.h>
29#include <asm/thread_info.h>
30#include <asm/hw_irq.h>
31#include <asm/page_types.h>
32#include <asm/irqflags.h>
33#include <asm/paravirt.h>
34#include <asm/percpu.h>
35#include <asm/asm.h>
36#include <asm/smap.h>
37#include <asm/pgtable_types.h>
38#include <asm/export.h>
39#include <asm/frame.h>
40#include <asm/trapnr.h>
41#include <asm/nospec-branch.h>
42#include <linux/err.h>
43
44#include "calling.h"
45
46.code64
47.section .entry.text, "ax"
48
49#ifdef CONFIG_PARAVIRT
50SYM_CODE_START(native_usergs_sysret64)
51	UNWIND_HINT_EMPTY
52	swapgs
53	sysretq
54SYM_CODE_END(native_usergs_sysret64)
55#endif /* CONFIG_PARAVIRT */
56
57.macro TRACE_IRQS_FLAGS flags:req
58#ifdef CONFIG_TRACE_IRQFLAGS
59	btl	$9, \flags		/* interrupts off? */
60	jnc	1f
61	TRACE_IRQS_ON
621:
63#endif
64.endm
65
66.macro TRACE_IRQS_IRETQ
67	TRACE_IRQS_FLAGS EFLAGS(%rsp)
68.endm
69
70/*
71 * When dynamic function tracer is enabled it will add a breakpoint
72 * to all locations that it is about to modify, sync CPUs, update
73 * all the code, sync CPUs, then remove the breakpoints. During this time,
74 * if lockdep is enabled, it might jump back into the debug handler
75 * (TRACE_IRQS_ON/OFF) outside the updating of the IST protection.
76 *
77 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
78 * make sure the stack pointer does not get reset back to the top
79 * of the debug stack, and instead just reuses the current stack.
80 */
81#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
82
83.macro TRACE_IRQS_OFF_DEBUG
84	call	debug_stack_set_zero
85	TRACE_IRQS_OFF
86	call	debug_stack_reset
87.endm
88
89.macro TRACE_IRQS_ON_DEBUG
90	call	debug_stack_set_zero
91	TRACE_IRQS_ON
92	call	debug_stack_reset
93.endm
94
95.macro TRACE_IRQS_IRETQ_DEBUG
96	btl	$9, EFLAGS(%rsp)		/* interrupts off? */
97	jnc	1f
98	TRACE_IRQS_ON_DEBUG
991:
100.endm
101
102#else
103# define TRACE_IRQS_OFF_DEBUG			TRACE_IRQS_OFF
104# define TRACE_IRQS_ON_DEBUG			TRACE_IRQS_ON
105# define TRACE_IRQS_IRETQ_DEBUG			TRACE_IRQS_IRETQ
106#endif
107
108/*
109 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
110 *
111 * This is the only entry point used for 64-bit system calls.  The
112 * hardware interface is reasonably well designed and the register to
113 * argument mapping Linux uses fits well with the registers that are
114 * available when SYSCALL is used.
115 *
116 * SYSCALL instructions can be found inlined in libc implementations as
117 * well as some other programs and libraries.  There are also a handful
118 * of SYSCALL instructions in the vDSO, used, for example, as a
119 * clock_gettimeofday fallback.
120 *
121 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
122 * then loads new ss, cs, and rip from previously programmed MSRs.
123 * rflags gets masked by a value from another MSR (so CLD and CLAC
124 * are not needed). SYSCALL does not save anything on the stack
125 * and does not change rsp.
126 *
127 * Registers on entry:
128 * rax  system call number
129 * rcx  return address
130 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
131 * rdi  arg0
132 * rsi  arg1
133 * rdx  arg2
134 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
135 * r8   arg4
136 * r9   arg5
137 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
138 *
139 * Only called from user space.
140 *
141 * When the user can change pt_regs->foo, always force IRET. That is because
142 * IRET deals with non-canonical addresses better. SYSRET has trouble
143 * with them due to bugs in both AMD and Intel CPUs.
144 */
145
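/*
 * Illustrative only, not kernel code: a minimal user-space sketch of the
 * register convention described above (assuming __NR_write == 1 on x86-64;
 * msg and msg_len are hypothetical labels):
 *
 *	movq	$1, %rax		# system call number (write)
 *	movq	$1, %rdi		# arg0: fd
 *	leaq	msg(%rip), %rsi		# arg1: buf
 *	movq	$msg_len, %rdx		# arg2: count
 *	syscall				# rcx := return RIP, r11 := RFLAGS
 */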
146SYM_CODE_START(entry_SYSCALL_64)
147	UNWIND_HINT_EMPTY
148	/*
149	 * Interrupts are off on entry.
150	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
151	 * it is too small to ever cause noticeable irq latency.
152	 */
153
154	swapgs
155	/* tss.sp2 is scratch space. */
156	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
157	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
158	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
159
160	/* Construct struct pt_regs on stack */
161	pushq	$__USER_DS				/* pt_regs->ss */
162	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
163	pushq	%r11					/* pt_regs->flags */
164	pushq	$__USER_CS				/* pt_regs->cs */
165	pushq	%rcx					/* pt_regs->ip */
166SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
167	pushq	%rax					/* pt_regs->orig_ax */
168
169	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
170
171	/* IRQs are off. */
172	movq	%rax, %rdi
173	movq	%rsp, %rsi
174	call	do_syscall_64		/* returns with IRQs disabled */
175
176	/*
177	 * Try to use SYSRET instead of IRET if we're returning to
178	 * a completely clean 64-bit userspace context.  If we're not,
179	 * go to the slow exit path.
180	 */
181	movq	RCX(%rsp), %rcx
182	movq	RIP(%rsp), %r11
183
184	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
185	jne	swapgs_restore_regs_and_return_to_usermode
186
187	/*
188	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
189	 * in kernel space.  This essentially lets the user take over
190	 * the kernel, since userspace controls RSP.
191	 *
192	 * If the width of the "canonical tail" ever becomes variable, this will need
193	 * to be updated to remain correct on both old and new CPUs.
194	 *
195	 * Change top bits to match most significant bit (47th or 56th bit
196	 * depending on paging mode) in the address.
197	 */
198#ifdef CONFIG_X86_5LEVEL
199	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
200		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
201#else
202	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
203	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
204#endif
205
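	/*
	 * Worked example (LA48 case, purely illustrative): a canonical
	 * RCX such as 0x00007fffffffffff is left unchanged by the
	 * shl/sar pair above, while the non-canonical value
	 * 0x0000800000000000 becomes 0xffff800000000000, so the
	 * comparison below catches it.
	 */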
206	/* If this changed %rcx, it was not canonical */
207	cmpq	%rcx, %r11
208	jne	swapgs_restore_regs_and_return_to_usermode
209
210	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
211	jne	swapgs_restore_regs_and_return_to_usermode
212
213	movq	R11(%rsp), %r11
214	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
215	jne	swapgs_restore_regs_and_return_to_usermode
216
217	/*
218	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
219	 * restore RF properly. If the slowpath sets it for whatever reason, we
220	 * need to restore it correctly.
221	 *
222	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
223	 * trap from userspace immediately after SYSRET.  This would cause an
224	 * infinite loop whenever #DB happens with register state that satisfies
225	 * the opportunistic SYSRET conditions.  For example, single-stepping
226	 * this user code:
227	 *
228	 *           movq	$stuck_here, %rcx
229	 *           pushfq
230	 *           popq %r11
231	 *   stuck_here:
232	 *
233	 * would never get past 'stuck_here'.
234	 */
235	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
236	jnz	swapgs_restore_regs_and_return_to_usermode
237
238	/* nothing to check for RSP */
239
240	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
241	jne	swapgs_restore_regs_and_return_to_usermode
242
243	/*
244	 * We win! This label is here just for ease of understanding
245	 * perf profiles. Nothing jumps here.
246	 */
247syscall_return_via_sysret:
248	/* rcx and r11 are already restored (see code above) */
249	POP_REGS pop_rdi=0 skip_r11rcx=1
250
251	/*
252	 * Now all regs are restored except RSP and RDI.
253	 * Save old stack pointer and switch to trampoline stack.
254	 */
255	movq	%rsp, %rdi
256	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
257	UNWIND_HINT_EMPTY
258
259	pushq	RSP-RDI(%rdi)	/* RSP */
260	pushq	(%rdi)		/* RDI */
261
262	/*
263	 * We are on the trampoline stack.  All regs except RDI are live.
264	 * We can do future final exit work right here.
265	 */
266	STACKLEAK_ERASE_NOCLOBBER
267
268	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
269
270	popq	%rdi
271	popq	%rsp
272	USERGS_SYSRET64
273SYM_CODE_END(entry_SYSCALL_64)
274
275/*
276 * %rdi: prev task
277 * %rsi: next task
278 */
279.pushsection .text, "ax"
280SYM_FUNC_START(__switch_to_asm)
281	/*
282	 * Save callee-saved registers
283	 * This must match the order in inactive_task_frame
284	 */
285	pushq	%rbp
286	pushq	%rbx
287	pushq	%r12
288	pushq	%r13
289	pushq	%r14
290	pushq	%r15
291
292	/* switch stack */
293	movq	%rsp, TASK_threadsp(%rdi)
294	movq	TASK_threadsp(%rsi), %rsp
295
296#ifdef CONFIG_STACKPROTECTOR
297	movq	TASK_stack_canary(%rsi), %rbx
298	movq	%rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
299#endif
300
301#ifdef CONFIG_RETPOLINE
302	/*
303	 * When switching from a shallower to a deeper call stack
304	 * the RSB may either underflow or use entries populated
305	 * with userspace addresses. On CPUs where those concerns
306	 * exist, overwrite the RSB with entries which capture
307	 * speculative execution to prevent attack.
308	 */
309	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
310#endif
311
312	/* restore callee-saved registers */
313	popq	%r15
314	popq	%r14
315	popq	%r13
316	popq	%r12
317	popq	%rbx
318	popq	%rbp
319
320	jmp	__switch_to
321SYM_FUNC_END(__switch_to_asm)
322.popsection
323
324/*
325 * A newly forked process directly context switches into this address.
326 *
327 * rax: prev task we switched from
328 * rbx: kernel thread func (NULL for user thread)
329 * r12: kernel thread arg
330 */
331.pushsection .text, "ax"
332SYM_CODE_START(ret_from_fork)
333	UNWIND_HINT_EMPTY
334	movq	%rax, %rdi
335	call	schedule_tail			/* rdi: 'prev' task parameter */
336
337	testq	%rbx, %rbx			/* from kernel_thread? */
338	jnz	1f				/* kernel threads are uncommon */
339
3402:
341	UNWIND_HINT_REGS
342	movq	%rsp, %rdi
343	call	syscall_return_slowpath	/* returns with IRQs disabled */
344	jmp	swapgs_restore_regs_and_return_to_usermode
345
3461:
347	/* kernel thread */
348	UNWIND_HINT_EMPTY
349	movq	%r12, %rdi
350	CALL_NOSPEC rbx
351	/*
352	 * A kernel thread is allowed to return here after successfully
353	 * calling do_execve().  Exit to userspace to complete the execve()
354	 * syscall.
355	 */
356	movq	$0, RAX(%rsp)
357	jmp	2b
358SYM_CODE_END(ret_from_fork)
359.popsection
360
361.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
362#ifdef CONFIG_DEBUG_ENTRY
363	pushq %rax
364	SAVE_FLAGS(CLBR_RAX)
365	testl $X86_EFLAGS_IF, %eax
366	jz .Lokay_\@
367	ud2
368.Lokay_\@:
369	popq %rax
370#endif
371.endm
372
373/*
374 * Enters the IRQ stack if we're not already using it.  NMI-safe.  Clobbers
375 * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
376 * Requires kernel GSBASE.
377 *
378 * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
379 */
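/*
 * Illustrative pairing only; the real uses are in interrupt_entry and
 * ret_from_intr further down:
 *
 *	ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
 *	... run the handler on the IRQ stack ...
 *	LEAVE_IRQ_STACK
 */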
380.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
381	DEBUG_ENTRY_ASSERT_IRQS_OFF
382
383	.if \save_ret
384	/*
385	 * If save_ret is set, the original stack contains one additional
386	 * entry -- the return address. Therefore, move the address one
387	 * entry below %rsp to \old_rsp.
388	 */
389	leaq	8(%rsp), \old_rsp
390	.else
391	movq	%rsp, \old_rsp
392	.endif
393
394	.if \regs
395	UNWIND_HINT_REGS base=\old_rsp
396	.endif
397
398	incl	PER_CPU_VAR(irq_count)
399	jnz	.Lirq_stack_push_old_rsp_\@
400
401	/*
402	 * Right now, if we just incremented irq_count to zero, we've
403	 * claimed the IRQ stack but we haven't switched to it yet.
404	 *
405	 * If anything is added that can interrupt us here without using IST,
406	 * it must be *extremely* careful to limit its stack usage.  This
407	 * could include kprobes and a hypothetical future IST-less #DB
408	 * handler.
409	 *
410	 * The OOPS unwinder relies on the word at the top of the IRQ
411	 * stack linking back to the previous RSP for the entire time we're
412	 * on the IRQ stack.  For this to work reliably, we need to write
413	 * it before we actually move ourselves to the IRQ stack.
414	 */
415
416	movq	\old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8)
417	movq	PER_CPU_VAR(hardirq_stack_ptr), %rsp
418
419#ifdef CONFIG_DEBUG_ENTRY
420	/*
421	 * If the first movq above becomes wrong due to IRQ stack layout
422	 * changes, the only way we'll notice is if we try to unwind right
423	 * here.  Assert that we set up the stack right to catch this type
424	 * of bug quickly.
425	 */
426	cmpq	-8(%rsp), \old_rsp
427	je	.Lirq_stack_okay\@
428	ud2
429	.Lirq_stack_okay\@:
430#endif
431
432.Lirq_stack_push_old_rsp_\@:
433	pushq	\old_rsp
434
435	.if \regs
436	UNWIND_HINT_REGS indirect=1
437	.endif
438
439	.if \save_ret
440	/*
441	 * Push the return address to the stack. This return address can
442	 * be found at the "real" original RSP, which was offset by 8 at
443	 * the beginning of this macro.
444	 */
445	pushq	-8(\old_rsp)
446	.endif
447.endm
448
449/*
450 * Undoes ENTER_IRQ_STACK.
451 */
452.macro LEAVE_IRQ_STACK regs=1
453	DEBUG_ENTRY_ASSERT_IRQS_OFF
454	/* We need to be off the IRQ stack before decrementing irq_count. */
455	popq	%rsp
456
457	.if \regs
458	UNWIND_HINT_REGS
459	.endif
460
461	/*
462	 * As in ENTER_IRQ_STACK, while irq_count == 0 we are still claiming
463	 * the IRQ stack even though we're no longer on it.
464	 */
465
466	decl	PER_CPU_VAR(irq_count)
467.endm
468
469/**
470 * idtentry_body - Macro to emit code calling the C function
471 * @cfunc:		C function to be called
472 * @has_error_code:	Hardware pushed error code on stack
473 */
474.macro idtentry_body cfunc has_error_code:req
475
476	call	error_entry
477	UNWIND_HINT_REGS
478
479	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/
480
481	.if \has_error_code == 1
482		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
483		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
484	.endif
485
486	call	\cfunc
487
488	jmp	error_return
489.endm
490
491/**
492 * idtentry - Macro to generate entry stubs for simple IDT entries
493 * @vector:		Vector number
494 * @asmsym:		ASM symbol for the entry point
495 * @cfunc:		C function to be called
496 * @has_error_code:	Hardware pushed error code on stack
497 *
498 * The macro emits code to set up the kernel context for straightforward
499 * and simple IDT entries. No IST stack, no paranoid entry checks.
500 */
501.macro idtentry vector asmsym cfunc has_error_code:req
502SYM_CODE_START(\asmsym)
503	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
504	ASM_CLAC
505
506	.if \has_error_code == 0
507		pushq	$-1			/* ORIG_RAX: no syscall to restart */
508	.endif
509
510	.if \vector == X86_TRAP_BP
511		/*
512		 * If coming from kernel space, create a 6-word gap to allow the
513		 * int3 handler to emulate a call instruction.
514		 */
515		testb	$3, CS-ORIG_RAX(%rsp)
516		jnz	.Lfrom_usermode_no_gap_\@
517		.rept	6
518		pushq	5*8(%rsp)
519		.endr
520		UNWIND_HINT_IRET_REGS offset=8
521.Lfrom_usermode_no_gap_\@:
522	.endif
523
524	idtentry_body \cfunc \has_error_code
525
526_ASM_NOKPROBE(\asmsym)
527SYM_CODE_END(\asmsym)
528.endm
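/*
 * Illustrative instantiation only; the real stubs are emitted via the
 * DECLARE/DEFINE machinery in <asm/idtentry.h>.  A simple exception
 * without an error code might be wired up roughly as:
 *
 *	idtentry X86_TRAP_DE	asm_exc_divide_error	exc_divide_error	has_error_code=0
 */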
529
530/*
531 * Interrupt entry/exit.
532 *
533 * The interrupt stubs push (vector) onto the stack, which is the error_code
534 * position of idtentry exceptions, and jump to one of the two idtentry points
535 * (common/spurious).
536 *
537 * common_interrupt is a hotpath, so align it to a cache line.
538 */
539.macro idtentry_irq vector cfunc
540	.p2align CONFIG_X86_L1_CACHE_SHIFT
541	idtentry \vector asm_\cfunc \cfunc has_error_code=1
542.endm
543
544/*
545 * System vectors which invoke their handlers directly and are not
546 * going through the regular common device interrupt handling code.
547 */
548.macro idtentry_sysvec vector cfunc
549	idtentry \vector asm_\cfunc \cfunc has_error_code=0
550.endm
551
552/*
553 * MCE and DB exceptions
554 */
555#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
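/*
 * Purely illustrative: with the IST_INDEX_DB index used further down,
 * CPU_TSS_IST(IST_INDEX_DB) expands to the per-CPU address of the #DB
 * IST slot, i.e.
 *
 *	PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + IST_INDEX_DB * 8)
 */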
556
557/**
558 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
559 * @vector:		Vector number
560 * @asmsym:		ASM symbol for the entry point
561 * @cfunc:		C function to be called
562 *
563 * The macro emits code to set up the kernel context for #MC and #DB
564 *
565 * If the entry comes from user space it uses the normal entry path
566 * including the return to user space work and preemption checks on
567 * exit.
568 *
569 * If it hits in kernel mode, it needs to go through the paranoid
570 * entry as the exception can hit any random state. No preemption
571 * check on exit to keep the paranoid path simple.
572 *
573 * If the trap is #DB then the interrupt stack entry in the IST is
574 * moved to the second stack, so a potential recursion will have a
575 * fresh IST.
576 */
577.macro idtentry_mce_db vector asmsym cfunc
578SYM_CODE_START(\asmsym)
579	UNWIND_HINT_IRET_REGS
580	ASM_CLAC
581
582	pushq	$-1			/* ORIG_RAX: no syscall to restart */
583
584	/*
585	 * If the entry is from userspace, switch stacks and treat it as
586	 * a normal entry.
587	 */
588	testb	$3, CS-ORIG_RAX(%rsp)
589	jnz	.Lfrom_usermode_switch_stack_\@
590
591	/*
592	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
593	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
594	 */
595	call	paranoid_entry
596
597	UNWIND_HINT_REGS
598
599	.if \vector == X86_TRAP_DB
600		TRACE_IRQS_OFF_DEBUG
601	.else
602		TRACE_IRQS_OFF
603	.endif
604
605	movq	%rsp, %rdi		/* pt_regs pointer */
606
607	.if \vector == X86_TRAP_DB
608		subq	$DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB)
609	.endif
610
611	call	\cfunc
612
613	.if \vector == X86_TRAP_DB
614		addq	$DB_STACK_OFFSET, CPU_TSS_IST(IST_INDEX_DB)
615	.endif
616
617	jmp	paranoid_exit
618
619	/* Switch to the regular task stack and use the noist entry point */
620.Lfrom_usermode_switch_stack_\@:
621	idtentry_body noist_\cfunc, has_error_code=0
622
623_ASM_NOKPROBE(\asmsym)
624SYM_CODE_END(\asmsym)
625.endm
626
627/*
628 * Double fault entry. Straight paranoid. No checks on which context this
629 * came from, because for the espfix-induced #DF this would do the wrong
630 * thing.
631 */
632.macro idtentry_df vector asmsym cfunc
633SYM_CODE_START(\asmsym)
634	UNWIND_HINT_IRET_REGS offset=8
635	ASM_CLAC
636
637	/*
638	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
639	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
640	 */
641	call	paranoid_entry
642	UNWIND_HINT_REGS
643
644	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
645	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
646	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
647	call	\cfunc
648
649	jmp	paranoid_exit
650
651_ASM_NOKPROBE(\asmsym)
652SYM_CODE_END(\asmsym)
653.endm
654
655/*
656 * Include the defines which emit the idt entries which are shared
657 * between 32 and 64 bit.
658 */
659#include <asm/idtentry.h>
660
661/*
662 * Interrupt entry helper function.
663 *
664 * Entry runs with interrupts off. Stack layout at entry:
665 * +----------------------------------------------------+
666 * | regs->ss						|
667 * | regs->rsp						|
668 * | regs->eflags					|
669 * | regs->cs						|
670 * | regs->ip						|
671 * +----------------------------------------------------+
672 * | regs->orig_ax = ~(interrupt number)		|
673 * +----------------------------------------------------+
674 * | return address					|
675 * +----------------------------------------------------+
676 */
677SYM_CODE_START(interrupt_entry)
678	UNWIND_HINT_IRET_REGS offset=16
679	ASM_CLAC
680	cld
681
682	testb	$3, CS-ORIG_RAX+8(%rsp)
683	jz	1f
684	SWAPGS
685	FENCE_SWAPGS_USER_ENTRY
686	/*
687	 * Switch to the thread stack. The IRET frame and orig_ax are
688	 * on the stack, as well as the return address. RDI..R12 are
689	 * not (yet) on the stack and space has not (yet) been
690	 * allocated for them.
691	 */
692	pushq	%rdi
693
694	/* Need to switch before accessing the thread stack. */
695	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
696	movq	%rsp, %rdi
697	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
698
699	 /*
700	  * We have RDI, return address, and orig_ax on the stack on
701	  * top of the IRET frame. That means offset=24
702	  */
703	UNWIND_HINT_IRET_REGS base=%rdi offset=24
704
705	pushq	7*8(%rdi)		/* regs->ss */
706	pushq	6*8(%rdi)		/* regs->rsp */
707	pushq	5*8(%rdi)		/* regs->eflags */
708	pushq	4*8(%rdi)		/* regs->cs */
709	pushq	3*8(%rdi)		/* regs->ip */
710	UNWIND_HINT_IRET_REGS
711	pushq	2*8(%rdi)		/* regs->orig_ax */
712	pushq	8(%rdi)			/* return address */
713
714	movq	(%rdi), %rdi
715	jmp	2f
7161:
717	FENCE_SWAPGS_KERNEL_ENTRY
7182:
719	PUSH_AND_CLEAR_REGS save_ret=1
720	ENCODE_FRAME_POINTER 8
721
722	testb	$3, CS+8(%rsp)
723	jz	1f
724
725	/*
726	 * IRQ from user mode.
727	 *
728	 * We need to tell lockdep that IRQs are off.  We can't do this until
729	 * we fix gsbase, and we should do it before enter_from_user_mode
730	 * (which can take locks).  Since TRACE_IRQS_OFF is idempotent,
731	 * the simplest way to handle it is to just call it twice if
732	 * we enter from user mode.  There's no reason to optimize this since
733	 * TRACE_IRQS_OFF is a no-op if lockdep is off.
734	 */
735	TRACE_IRQS_OFF
736
737	CALL_enter_from_user_mode
738
7391:
740	ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
741	/* We entered an interrupt context - irqs are off: */
742	TRACE_IRQS_OFF
743
744	ret
745SYM_CODE_END(interrupt_entry)
746_ASM_NOKPROBE(interrupt_entry)
747
748SYM_CODE_START_LOCAL(common_interrupt_return)
749ret_from_intr:
750	DISABLE_INTERRUPTS(CLBR_ANY)
751	TRACE_IRQS_OFF
752
753	LEAVE_IRQ_STACK
754
755	testb	$3, CS(%rsp)
756	jz	retint_kernel
757
758	/* Interrupt came from user space */
759.Lretint_user:
760	mov	%rsp,%rdi
761	call	prepare_exit_to_usermode
762
763SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
764#ifdef CONFIG_DEBUG_ENTRY
765	/* Assert that pt_regs indicates user mode. */
766	testb	$3, CS(%rsp)
767	jnz	1f
768	ud2
7691:
770#endif
771	POP_REGS pop_rdi=0
772
773	/*
774	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
775	 * Save old stack pointer and switch to trampoline stack.
776	 */
777	movq	%rsp, %rdi
778	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
779	UNWIND_HINT_EMPTY
780
781	/* Copy the IRET frame to the trampoline stack. */
782	pushq	6*8(%rdi)	/* SS */
783	pushq	5*8(%rdi)	/* RSP */
784	pushq	4*8(%rdi)	/* EFLAGS */
785	pushq	3*8(%rdi)	/* CS */
786	pushq	2*8(%rdi)	/* RIP */
787
788	/* Push user RDI on the trampoline stack. */
789	pushq	(%rdi)
790
791	/*
792	 * We are on the trampoline stack.  All regs except RDI are live.
793	 * We can do future final exit work right here.
794	 */
795	STACKLEAK_ERASE_NOCLOBBER
796
797	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
798
799	/* Restore RDI. */
800	popq	%rdi
801	SWAPGS
802	INTERRUPT_RETURN
803
804
805/* Returning to kernel space */
806retint_kernel:
807#ifdef CONFIG_PREEMPTION
808	/* Interrupts are off */
809	/* Check if we need preemption */
810	btl	$9, EFLAGS(%rsp)		/* were interrupts off? */
811	jnc	1f
812	cmpl	$0, PER_CPU_VAR(__preempt_count)
813	jnz	1f
814	call	preempt_schedule_irq
8151:
816#endif
817	/*
818	 * The iretq could re-enable interrupts:
819	 */
820	TRACE_IRQS_IRETQ
821
822SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
823#ifdef CONFIG_DEBUG_ENTRY
824	/* Assert that pt_regs indicates kernel mode. */
825	testb	$3, CS(%rsp)
826	jz	1f
827	ud2
8281:
829#endif
830	POP_REGS
831	addq	$8, %rsp	/* skip regs->orig_ax */
832	/*
833	 * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
834	 * when returning from an IPI handler.
835	 */
836	INTERRUPT_RETURN
837
838SYM_INNER_LABEL_ALIGN(native_iret, SYM_L_GLOBAL)
839	UNWIND_HINT_IRET_REGS
840	/*
841	 * Are we returning to a stack segment from the LDT?  Note: in
842	 * 64-bit mode SS:RSP on the exception stack is always valid.
843	 */
844#ifdef CONFIG_X86_ESPFIX64
845	testb	$4, (SS-RIP)(%rsp)
846	jnz	native_irq_return_ldt
847#endif
848
849SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
850	/*
851	 * This may fault.  Non-paranoid faults on return to userspace are
852	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
853	 * Double-faults due to espfix64 are handled in exc_double_fault.
854	 * Other faults here are fatal.
855	 */
856	iretq
857
858#ifdef CONFIG_X86_ESPFIX64
859native_irq_return_ldt:
860	/*
861	 * We are running with user GSBASE.  All GPRs contain their user
862	 * values.  We have a percpu ESPFIX stack that is eight slots
863	 * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
864	 * of the ESPFIX stack.
865	 *
866	 * We clobber RAX and RDI in this code.  We stash RDI on the
867	 * normal stack and RAX on the ESPFIX stack.
868	 *
869	 * The ESPFIX stack layout we set up looks like this:
870	 *
871	 * --- top of ESPFIX stack ---
872	 * SS
873	 * RSP
874	 * RFLAGS
875	 * CS
876	 * RIP  <-- RSP points here when we're done
877	 * RAX  <-- espfix_waddr points here
878	 * --- bottom of ESPFIX stack ---
879	 */
880
881	pushq	%rdi				/* Stash user RDI */
882	SWAPGS					/* to kernel GS */
883	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */
884
885	movq	PER_CPU_VAR(espfix_waddr), %rdi
886	movq	%rax, (0*8)(%rdi)		/* user RAX */
887	movq	(1*8)(%rsp), %rax		/* user RIP */
888	movq	%rax, (1*8)(%rdi)
889	movq	(2*8)(%rsp), %rax		/* user CS */
890	movq	%rax, (2*8)(%rdi)
891	movq	(3*8)(%rsp), %rax		/* user RFLAGS */
892	movq	%rax, (3*8)(%rdi)
893	movq	(5*8)(%rsp), %rax		/* user SS */
894	movq	%rax, (5*8)(%rdi)
895	movq	(4*8)(%rsp), %rax		/* user RSP */
896	movq	%rax, (4*8)(%rdi)
897	/* Now RAX == RSP. */
898
899	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
900
901	/*
902	 * espfix_stack[31:16] == 0.  The page tables are set up such that
903	 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
904	 * espfix_waddr for any X.  That is, there are 65536 RO aliases of
905	 * the same page.  Set up RSP so that RSP[31:16] contains the
906	 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
907	 * still points to an RO alias of the ESPFIX stack.
908	 */
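	/*
	 * Worked example with hypothetical numbers: if the user RSP was
	 * 0x00007ffd1234abcd, the andl above left RAX == 0x12340000, and
	 * after the orq below RSP[31:16] == 0x1234 while the remaining
	 * bits come from espfix_stack, still aliasing the ESPFIX frame.
	 */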
909	orq	PER_CPU_VAR(espfix_stack), %rax
910
911	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
912	SWAPGS					/* to user GS */
913	popq	%rdi				/* Restore user RDI */
914
915	movq	%rax, %rsp
916	UNWIND_HINT_IRET_REGS offset=8
917
918	/*
919	 * At this point, we cannot write to the stack any more, but we can
920	 * still read.
921	 */
922	popq	%rax				/* Restore user RAX */
923
924	/*
925	 * RSP now points to an ordinary IRET frame, except that the page
926	 * is read-only and RSP[31:16] are preloaded with the userspace
927	 * values.  We can now IRET back to userspace.
928	 */
929	jmp	native_irq_return_iret
930#endif
931SYM_CODE_END(common_interrupt_return)
932_ASM_NOKPROBE(common_interrupt_return)
933
934/*
935 * APIC interrupts.
936 */
937.macro apicinterrupt3 num sym do_sym
938SYM_CODE_START(\sym)
939	UNWIND_HINT_IRET_REGS
940	pushq	$~(\num)
941	call	interrupt_entry
942	UNWIND_HINT_REGS indirect=1
943	call	\do_sym	/* rdi points to pt_regs */
944	jmp	ret_from_intr
945SYM_CODE_END(\sym)
946_ASM_NOKPROBE(\sym)
947.endm
948
949/* Make sure APIC interrupt handlers end up in the irqentry section: */
950#define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
951#define POP_SECTION_IRQENTRY	.popsection
952
953.macro apicinterrupt num sym do_sym
954PUSH_SECTION_IRQENTRY
955apicinterrupt3 \num \sym \do_sym
956POP_SECTION_IRQENTRY
957.endm
958
959#ifdef CONFIG_SMP
960apicinterrupt RESCHEDULE_VECTOR			reschedule_interrupt		smp_reschedule_interrupt
961#endif
962
963/*
964 * Reload gs selector with exception handling
965 * edi:  new selector
966 *
967 * Is in entry.text as it shouldn't be instrumented.
968 */
969SYM_FUNC_START(asm_load_gs_index)
970	FRAME_BEGIN
971	swapgs
972.Lgs_change:
973	movl	%edi, %gs
9742:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
975	swapgs
976	FRAME_END
977	ret
978SYM_FUNC_END(asm_load_gs_index)
979EXPORT_SYMBOL(asm_load_gs_index)
980
981	_ASM_EXTABLE(.Lgs_change, .Lbad_gs)
982	.section .fixup, "ax"
983	/* running with kernelgs */
984SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
985	swapgs					/* switch back to user gs */
986.macro ZAP_GS
987	/* This can't be a string because the preprocessor needs to see it. */
988	movl $__USER_DS, %eax
989	movl %eax, %gs
990.endm
991	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
992	xorl	%eax, %eax
993	movl	%eax, %gs
994	jmp	2b
995SYM_CODE_END(.Lbad_gs)
996	.previous
997
998/*
999 * rdi: New stack pointer, pointing to the top word of the stack
1000 * rsi: Function pointer
1001 * rdx: Function argument (can be NULL if none)
1002 */
1003SYM_FUNC_START(asm_call_on_stack)
1004	/*
1005	 * Save the frame pointer unconditionally. This allows the ORC
1006	 * unwinder to handle the stack switch.
1007	 */
1008	pushq		%rbp
1009	mov		%rsp, %rbp
1010
1011	/*
1012	 * The unwinder relies on the word at the top of the new stack
1013	 * page linking back to the previous RSP.
1014	 */
1015	mov		%rsp, (%rdi)
1016	mov		%rdi, %rsp
1017	/* Move the argument to the right place */
1018	mov		%rdx, %rdi
1019
10201:
1021	.pushsection .discard.instr_begin
1022	.long 1b - .
1023	.popsection
1024
1025	CALL_NOSPEC	rsi
1026
10272:
1028	.pushsection .discard.instr_end
1029	.long 2b - .
1030	.popsection
1031
1032	/* Restore the previous stack pointer from RBP. */
1033	leaveq
1034	ret
1035SYM_FUNC_END(asm_call_on_stack)
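/*
 * Illustrative call sequence only, assuming a C prototype along the lines
 * of void asm_call_on_stack(void *sp, void (*func)(void), void *arg); the
 * real callers live in C code, and stack_top/some_handler below are
 * hypothetical symbols:
 *
 *	movq	stack_top(%rip), %rdi		# rdi: top word of the new stack
 *	leaq	some_handler(%rip), %rsi	# rsi: function to run on it
 *	movq	$0, %rdx			# rdx: its argument (NULL here)
 *	call	asm_call_on_stack
 */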
1036
1037#ifdef CONFIG_XEN_PV
1038/*
1039 * A note on the "critical region" in our callback handler.
1040 * We want to avoid stacking callback handlers due to events occurring
1041 * during handling of the last event. To do this, we keep events disabled
1042 * until we've done all processing. HOWEVER, we must enable events before
1043 * popping the stack frame (can't be done atomically) and so it would still
1044 * be possible to get enough handler activations to overflow the stack.
1045 * Although unlikely, bugs of that kind are hard to track down, so we'd
1046 * like to avoid the possibility.
1047 * So, on entry to the handler we detect whether we interrupted an
1048 * existing activation in its critical region -- if so, we pop the current
1049 * activation and restart the handler using the previous one.
1050 *
1051 * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
1052 */
1053SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
1054
1055/*
1056 * Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
1057 * see the correct pointer to the pt_regs.
1058 */
1059	UNWIND_HINT_FUNC
1060	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
1061	UNWIND_HINT_REGS
1062
1063	call	xen_pv_evtchn_do_upcall
1064
1065	jmp	error_return
1066SYM_CODE_END(exc_xen_hypervisor_callback)
1067
1068/*
1069 * Hypervisor uses this for application faults while it executes.
1070 * We get here for two reasons:
1071 *  1. Fault while reloading DS, ES, FS or GS
1072 *  2. Fault while executing IRET
1073 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1074 * registers that could be reloaded and zeroed the others.
1075 * Category 2 we fix up by killing the current process. We cannot use the
1076 * normal Linux return path in this case because if we use the IRET hypercall
1077 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1078 * We distinguish between categories by comparing each saved segment register
1079 * with its current contents: any discrepancy means we are in category 1.
1080 */
1081SYM_CODE_START(xen_failsafe_callback)
1082	UNWIND_HINT_EMPTY
1083	movl	%ds, %ecx
1084	cmpw	%cx, 0x10(%rsp)
1085	jne	1f
1086	movl	%es, %ecx
1087	cmpw	%cx, 0x18(%rsp)
1088	jne	1f
1089	movl	%fs, %ecx
1090	cmpw	%cx, 0x20(%rsp)
1091	jne	1f
1092	movl	%gs, %ecx
1093	cmpw	%cx, 0x28(%rsp)
1094	jne	1f
1095	/* All segments match their saved values => Category 2 (Bad IRET). */
1096	movq	(%rsp), %rcx
1097	movq	8(%rsp), %r11
1098	addq	$0x30, %rsp
1099	pushq	$0				/* RIP */
1100	UNWIND_HINT_IRET_REGS offset=8
1101	jmp	asm_exc_general_protection
11021:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1103	movq	(%rsp), %rcx
1104	movq	8(%rsp), %r11
1105	addq	$0x30, %rsp
1106	UNWIND_HINT_IRET_REGS
1107	pushq	$-1 /* orig_ax = -1 => not a system call */
1108	PUSH_AND_CLEAR_REGS
1109	ENCODE_FRAME_POINTER
1110	jmp	error_return
1111SYM_CODE_END(xen_failsafe_callback)
1112#endif /* CONFIG_XEN_PV */
1113
1114#ifdef CONFIG_XEN_PVHVM
1115apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
1116	xen_hvm_callback_vector xen_evtchn_do_upcall
1117#endif
1118
1119/*
1120 * Save all registers in pt_regs, and switch gs if needed.
1121 * Use a slow but surefire "are we in kernel?" check.
1122 * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
1123 */
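/*
 * A note on the check below, with illustrative values: rdmsr leaves the
 * high half of MSR_GS_BASE in %edx.  A kernel GSBASE is a kernel-half
 * address (e.g. 0xffff888000000000), so %edx has its sign bit set and
 * the js is taken; a user GSBASE (e.g. 0x00007f0000000000) leaves %edx
 * non-negative, so we SWAPGS and clear %ebx.
 */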
1124SYM_CODE_START_LOCAL(paranoid_entry)
1125	UNWIND_HINT_FUNC
1126	cld
1127	PUSH_AND_CLEAR_REGS save_ret=1
1128	ENCODE_FRAME_POINTER 8
1129	movl	$1, %ebx
1130	movl	$MSR_GS_BASE, %ecx
1131	rdmsr
1132	testl	%edx, %edx
1133	js	1f				/* negative -> in kernel */
1134	SWAPGS
1135	xorl	%ebx, %ebx
1136
11371:
1138	/*
1139	 * Always stash CR3 in %r14.  This value will be restored,
1140	 * verbatim, at exit.  Needed if paranoid_entry interrupted
1141	 * another entry that already switched to the user CR3 value
1142	 * but has not yet returned to userspace.
1143	 *
1144	 * This is also why CS (stashed in the "iret frame" by the
1145	 * hardware at entry) can not be used: this may be a return
1146	 * to kernel code, but with a user CR3 value.
1147	 */
1148	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
1149
1150	/*
1151	 * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
1152	 * unconditional CR3 write, even in the PTI case.  So do an lfence
1153	 * to prevent GS speculation, regardless of whether PTI is enabled.
1154	 */
1155	FENCE_SWAPGS_KERNEL_ENTRY
1156
1157	ret
1158SYM_CODE_END(paranoid_entry)
1159
1160/*
1161 * "Paranoid" exit path from exception stack.  This is invoked
1162 * only on return from non-NMI IST interrupts that came
1163 * from kernel space.
1164 *
1165 * We may be returning to very strange contexts (e.g. very early
1166 * in syscall entry), so checking for preemption here would
1167 * be complicated.  Fortunately, there's no good reason
1168 * to try to handle preemption here.
1169 *
1170 * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
1171 */
1172SYM_CODE_START_LOCAL(paranoid_exit)
1173	UNWIND_HINT_REGS
1174	DISABLE_INTERRUPTS(CLBR_ANY)
1175	TRACE_IRQS_OFF_DEBUG
1176	testl	%ebx, %ebx			/* swapgs needed? */
1177	jnz	.Lparanoid_exit_no_swapgs
1178	TRACE_IRQS_IRETQ
1179	/* Always restore stashed CR3 value (see paranoid_entry) */
1180	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
1181	SWAPGS_UNSAFE_STACK
1182	jmp	restore_regs_and_return_to_kernel
1183.Lparanoid_exit_no_swapgs:
1184	TRACE_IRQS_IRETQ_DEBUG
1185	/* Always restore stashed CR3 value (see paranoid_entry) */
1186	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
1187	jmp restore_regs_and_return_to_kernel
1188SYM_CODE_END(paranoid_exit)
1189
1190/*
1191 * Save all registers in pt_regs, and switch GS if needed.
1192 */
1193SYM_CODE_START_LOCAL(error_entry)
1194	UNWIND_HINT_FUNC
1195	cld
1196	PUSH_AND_CLEAR_REGS save_ret=1
1197	ENCODE_FRAME_POINTER 8
1198	testb	$3, CS+8(%rsp)
1199	jz	.Lerror_kernelspace
1200
1201	/*
1202	 * We entered from user mode or we're pretending to have entered
1203	 * from user mode due to an IRET fault.
1204	 */
1205	SWAPGS
1206	FENCE_SWAPGS_USER_ENTRY
1207	/* We have user CR3.  Change to kernel CR3. */
1208	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1209
1210.Lerror_entry_from_usermode_after_swapgs:
1211	/* Put us onto the real thread stack. */
1212	popq	%r12				/* save return addr in %r12 */
1213	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
1214	call	sync_regs
1215	movq	%rax, %rsp			/* switch stack */
1216	ENCODE_FRAME_POINTER
1217	pushq	%r12
1218	ret
1219
1220.Lerror_entry_done_lfence:
1221	FENCE_SWAPGS_KERNEL_ENTRY
1222.Lerror_entry_done:
1223	ret
1224
1225	/*
1226	 * There are two places in the kernel that can potentially fault with
1227	 * usergs. Handle them here.  B stepping K8s sometimes report a
1228	 * truncated RIP for IRET exceptions returning to compat mode. Check
1229	 * for these here too.
1230	 */
1231.Lerror_kernelspace:
1232	leaq	native_irq_return_iret(%rip), %rcx
1233	cmpq	%rcx, RIP+8(%rsp)
1234	je	.Lerror_bad_iret
1235	movl	%ecx, %eax			/* zero extend */
1236	cmpq	%rax, RIP+8(%rsp)
1237	je	.Lbstep_iret
1238	cmpq	$.Lgs_change, RIP+8(%rsp)
1239	jne	.Lerror_entry_done_lfence
1240
1241	/*
1242	 * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
1243	 * gsbase and proceed.  We'll fix up the exception and land in
1244	 * .Lgs_change's error handler with kernel gsbase.
1245	 */
1246	SWAPGS
1247	FENCE_SWAPGS_USER_ENTRY
1248	jmp .Lerror_entry_done
1249
1250.Lbstep_iret:
1251	/* Fix truncated RIP */
1252	movq	%rcx, RIP+8(%rsp)
1253	/* fall through */
1254
1255.Lerror_bad_iret:
1256	/*
1257	 * We came from an IRET to user mode, so we have user
1258	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
1259	 */
1260	SWAPGS
1261	FENCE_SWAPGS_USER_ENTRY
1262	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
1263
1264	/*
1265	 * Pretend that the exception came from user mode: set up pt_regs
1266	 * as if we faulted immediately after IRET.
1267	 */
1268	mov	%rsp, %rdi
1269	call	fixup_bad_iret
1270	mov	%rax, %rsp
1271	jmp	.Lerror_entry_from_usermode_after_swapgs
1272SYM_CODE_END(error_entry)
1273
1274SYM_CODE_START_LOCAL(error_return)
1275	UNWIND_HINT_REGS
1276	DEBUG_ENTRY_ASSERT_IRQS_OFF
1277	testb	$3, CS(%rsp)
1278	jz	restore_regs_and_return_to_kernel
1279	jmp	swapgs_restore_regs_and_return_to_usermode
1280SYM_CODE_END(error_return)
1281
1282/*
1283 * Runs on exception stack.  Xen PV does not go through this path at all,
1284 * so we can use real assembly here.
1285 *
1286 * Registers:
1287 *	%r14: Used to save/restore the CR3 of the interrupted context
1288 *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
1289 */
1290SYM_CODE_START(asm_exc_nmi)
1291	UNWIND_HINT_IRET_REGS
1292
1293	/*
1294	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1295	 * the iretq it performs will take us out of NMI context.
1296	 * This means that we can have nested NMIs where the next
1297	 * NMI is using the top of the stack of the previous NMI. We
1298	 * can't let it execute because the nested NMI will corrupt the
1299	 * stack of the previous NMI. NMI handlers are not re-entrant
1300	 * anyway.
1301	 *
1302	 * To handle this case we do the following:
1303	 *  Check a special location on the stack that contains
1304	 *  a variable that is set when NMIs are executing.
1305	 *  The interrupted task's stack is also checked to see if it
1306	 *  is an NMI stack.
1307	 *  If the variable is not set and the stack is not the NMI
1308	 *  stack then:
1309	 *    o Set the special variable on the stack
1310	 *    o Copy the interrupt frame into an "outermost" location on the
1311	 *      stack
1312	 *    o Copy the interrupt frame into an "iret" location on the stack
1313	 *    o Continue processing the NMI
1314	 *  If the variable is set or the previous stack is the NMI stack:
1315	 *    o Modify the "iret" location to jump to repeat_nmi
1316	 *    o Return back to the first NMI
1317	 *
1318	 * Now on exit of the first NMI, we first clear the stack variable.
1319	 * The NMI stack will tell any nested NMIs at that point that it is
1320	 * nested. Then we pop the stack normally with iret, and if there was
1321	 * a nested NMI that updated the copied interrupt stack frame, a
1322	 * jump will be made to the repeat_nmi code that will handle the second
1323	 * NMI.
1324	 *
1325	 * However, espfix prevents us from directly returning to userspace
1326	 * with a single IRET instruction.  Similarly, IRET to user mode
1327	 * can fault.  We therefore handle NMIs from user space like
1328	 * other IST entries.
1329	 */
1330
1331	ASM_CLAC
1332
1333	/* Use %rdx as our temp variable throughout */
1334	pushq	%rdx
1335
1336	testb	$3, CS-RIP+8(%rsp)
1337	jz	.Lnmi_from_kernel
1338
1339	/*
1340	 * NMI from user mode.  We need to run on the thread stack, but we
1341	 * can't go through the normal entry paths: NMIs are masked, and
1342	 * we don't want to enable interrupts, because then we'll end
1343	 * up in an awkward situation in which IRQs are on but NMIs
1344	 * are off.
1345	 *
1346	 * We also must not push anything to the stack before switching
1347	 * stacks lest we corrupt the "NMI executing" variable.
1348	 */
1349
1350	swapgs
1351	cld
1352	FENCE_SWAPGS_USER_ENTRY
1353	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
1354	movq	%rsp, %rdx
1355	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
1356	UNWIND_HINT_IRET_REGS base=%rdx offset=8
1357	pushq	5*8(%rdx)	/* pt_regs->ss */
1358	pushq	4*8(%rdx)	/* pt_regs->rsp */
1359	pushq	3*8(%rdx)	/* pt_regs->flags */
1360	pushq	2*8(%rdx)	/* pt_regs->cs */
1361	pushq	1*8(%rdx)	/* pt_regs->rip */
1362	UNWIND_HINT_IRET_REGS
1363	pushq   $-1		/* pt_regs->orig_ax */
1364	PUSH_AND_CLEAR_REGS rdx=(%rdx)
1365	ENCODE_FRAME_POINTER
1366
1367	/*
1368	 * At this point we no longer need to worry about stack damage
1369	 * due to nesting -- we're on the normal thread stack and we're
1370	 * done with the NMI stack.
1371	 */
1372
1373	movq	%rsp, %rdi
1374	movq	$-1, %rsi
1375	call	exc_nmi
1376
1377	/*
1378	 * Return back to user mode.  We must *not* do the normal exit
1379	 * work, because we don't want to enable interrupts.
1380	 */
1381	jmp	swapgs_restore_regs_and_return_to_usermode
1382
1383.Lnmi_from_kernel:
1384	/*
1385	 * Here's what our stack frame will look like:
1386	 * +---------------------------------------------------------+
1387	 * | original SS                                             |
1388	 * | original Return RSP                                     |
1389	 * | original RFLAGS                                         |
1390	 * | original CS                                             |
1391	 * | original RIP                                            |
1392	 * +---------------------------------------------------------+
1393	 * | temp storage for rdx                                    |
1394	 * +---------------------------------------------------------+
1395	 * | "NMI executing" variable                                |
1396	 * +---------------------------------------------------------+
1397	 * | iret SS          } Copied from "outermost" frame        |
1398	 * | iret Return RSP  } on each loop iteration; overwritten  |
1399	 * | iret RFLAGS      } by a nested NMI to force another     |
1400	 * | iret CS          } iteration if needed.                 |
1401	 * | iret RIP         }                                      |
1402	 * +---------------------------------------------------------+
1403	 * | outermost SS          } initialized in first_nmi;       |
1404	 * | outermost Return RSP  } will not be changed before      |
1405	 * | outermost RFLAGS      } NMI processing is done.         |
1406	 * | outermost CS          } Copied to "iret" frame on each  |
1407	 * | outermost RIP         } iteration.                      |
1408	 * +---------------------------------------------------------+
1409	 * | pt_regs                                                 |
1410	 * +---------------------------------------------------------+
1411	 *
1412	 * The "original" frame is used by hardware.  Before re-enabling
1413	 * NMIs, we need to be done with it, and we need to leave enough
1414	 * space for the asm code here.
1415	 *
1416	 * We return by executing IRET while RSP points to the "iret" frame.
1417	 * That will either return for real or it will loop back into NMI
1418	 * processing.
1419	 *
1420	 * The "outermost" frame is copied to the "iret" frame on each
1421	 * iteration of the loop, so each iteration starts with the "iret"
1422	 * frame pointing to the final return target.
1423	 */
1424
1425	/*
1426	 * Determine whether we're a nested NMI.
1427	 *
1428	 * If we interrupted kernel code between repeat_nmi and
1429	 * end_repeat_nmi, then we are a nested NMI.  We must not
1430	 * modify the "iret" frame because it's being written by
1431	 * the outer NMI.  That's okay; the outer NMI handler is
1432	 * about to call exc_nmi() anyway, so we can just
1433	 * resume the outer NMI.
1434	 */
1435
1436	movq	$repeat_nmi, %rdx
1437	cmpq	8(%rsp), %rdx
1438	ja	1f
1439	movq	$end_repeat_nmi, %rdx
1440	cmpq	8(%rsp), %rdx
1441	ja	nested_nmi_out
14421:
1443
1444	/*
1445	 * Now check "NMI executing".  If it's set, then we're nested.
1446	 * This will not detect if we interrupted an outer NMI just
1447	 * before IRET.
1448	 */
1449	cmpl	$1, -8(%rsp)
1450	je	nested_nmi
1451
1452	/*
1453	 * Now test if the previous stack was an NMI stack.  This covers
1454	 * the case where we interrupt an outer NMI after it clears
1455	 * "NMI executing" but before IRET.  We need to be careful, though:
1456	 * there is one case in which RSP could point to the NMI stack
1457	 * despite there being no NMI active: naughty userspace controls
1458	 * RSP at the very beginning of the SYSCALL targets.  We can
1459	 * pull a fast one on naughty userspace, though: we program
1460	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
1461	 * if it controls the kernel's RSP.  We set DF before we clear
1462	 * "NMI executing".
1463	 */
1464	lea	6*8(%rsp), %rdx
1465	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
1466	cmpq	%rdx, 4*8(%rsp)
1467	/* If the stack pointer is above the NMI stack, this is a normal NMI */
1468	ja	first_nmi
1469
1470	subq	$EXCEPTION_STKSZ, %rdx
1471	cmpq	%rdx, 4*8(%rsp)
1472	/* If it is below the NMI stack, it is a normal NMI */
1473	jb	first_nmi
1474
1475	/* Ah, it is within the NMI stack. */
1476
1477	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
1478	jz	first_nmi	/* RSP was user controlled. */
1479
1480	/* This is a nested NMI. */
1481
1482nested_nmi:
1483	/*
1484	 * Modify the "iret" frame to point to repeat_nmi, forcing another
1485	 * iteration of NMI handling.
1486	 */
1487	subq	$8, %rsp
1488	leaq	-10*8(%rsp), %rdx
1489	pushq	$__KERNEL_DS
1490	pushq	%rdx
1491	pushfq
1492	pushq	$__KERNEL_CS
1493	pushq	$repeat_nmi
1494
1495	/* Put stack back */
1496	addq	$(6*8), %rsp
1497
1498nested_nmi_out:
1499	popq	%rdx
1500
1501	/* We are returning to kernel mode, so this cannot result in a fault. */
1502	iretq
1503
1504first_nmi:
1505	/* Restore rdx. */
1506	movq	(%rsp), %rdx
1507
1508	/* Make room for "NMI executing". */
1509	pushq	$0
1510
1511	/* Leave room for the "iret" frame */
1512	subq	$(5*8), %rsp
1513
1514	/* Copy the "original" frame to the "outermost" frame */
1515	.rept 5
1516	pushq	11*8(%rsp)
1517	.endr
1518	UNWIND_HINT_IRET_REGS
1519
1520	/* Everything up to here is safe from nested NMIs */
1521
1522#ifdef CONFIG_DEBUG_ENTRY
1523	/*
1524	 * For ease of testing, unmask NMIs right away.  Disabled by
1525	 * default because IRET is very expensive.
1526	 */
1527	pushq	$0		/* SS */
1528	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
1529	addq	$8, (%rsp)	/* Fix up RSP */
1530	pushfq			/* RFLAGS */
1531	pushq	$__KERNEL_CS	/* CS */
1532	pushq	$1f		/* RIP */
1533	iretq			/* continues at repeat_nmi below */
1534	UNWIND_HINT_IRET_REGS
15351:
1536#endif
1537
1538repeat_nmi:
1539	/*
1540	 * If there was a nested NMI, the first NMI's iret will return
1541	 * here. But NMIs are still enabled and we can take another
1542	 * nested NMI. The nested NMI checks the interrupted RIP to see
1543	 * if it is between repeat_nmi and end_repeat_nmi, and if so
1544	 * it will just return, as we are about to repeat an NMI anyway.
1545	 * This makes it safe to copy to the stack frame that a nested
1546	 * NMI will update.
1547	 *
1548	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
1549	 * we're repeating an NMI, gsbase has the same value that it had on
1550	 * the first iteration.  paranoid_entry will load the kernel
1551	 * gsbase if needed before we call exc_nmi().  "NMI executing"
1552	 * is zero.
1553	 */
1554	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */
1555
1556	/*
1557	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
1558	 * here must not modify the "iret" frame while we're writing to
1559	 * it or it will end up containing garbage.
1560	 */
1561	addq	$(10*8), %rsp
1562	.rept 5
1563	pushq	-6*8(%rsp)
1564	.endr
1565	subq	$(5*8), %rsp
1566end_repeat_nmi:
1567
1568	/*
1569	 * Everything below this point can be preempted by a nested NMI.
1570	 * If this happens, then the inner NMI will change the "iret"
1571	 * frame to point back to repeat_nmi.
1572	 */
1573	pushq	$-1				/* ORIG_RAX: no syscall to restart */
1574
1575	/*
1576	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1577	 * as we should not be calling schedule in NMI context,
1578	 * even with normal interrupts enabled. An NMI should not be
1579	 * setting NEED_RESCHED or anything that normal interrupts and
1580	 * exceptions might do.
1581	 */
1582	call	paranoid_entry
1583	UNWIND_HINT_REGS
1584
1585	/* paranoidentry exc_nmi(), 0; without TRACE_IRQS_OFF */
1586	movq	%rsp, %rdi
1587	movq	$-1, %rsi
1588	call	exc_nmi
1589
1590	/* Always restore stashed CR3 value (see paranoid_entry) */
1591	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
1592
1593	testl	%ebx, %ebx			/* swapgs needed? */
1594	jnz	nmi_restore
1595nmi_swapgs:
1596	SWAPGS_UNSAFE_STACK
1597nmi_restore:
1598	POP_REGS
1599
1600	/*
1601	 * Skip orig_ax and the "outermost" frame to point RSP at
1602	 * the "iret" frame.
1603	 */
1604	addq	$6*8, %rsp
1605
1606	/*
1607	 * Clear "NMI executing".  Set DF first so that we can easily
1608	 * distinguish the remaining code between here and IRET from
1609	 * the SYSCALL entry and exit paths.
1610	 *
1611	 * We arguably should just inspect RIP instead, but I (Andy) wrote
1612	 * this code when I had the misapprehension that Xen PV supported
1613	 * NMIs, and Xen PV would break that approach.
1614	 */
1615	std
1616	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
1617
1618	/*
1619	 * iretq reads the "iret" frame and exits the NMI stack in a
1620	 * single instruction.  We are returning to kernel mode, so this
1621	 * cannot result in a fault.  Similarly, we don't need to worry
1622	 * about espfix64 on the way back to kernel mode.
1623	 */
1624	iretq
1625SYM_CODE_END(asm_exc_nmi)
1626
1627#ifndef CONFIG_IA32_EMULATION
1628/*
1629 * This handles SYSCALL from 32-bit code.  There is no way to program
1630 * MSRs to fully disable 32-bit SYSCALL.
1631 */
1632SYM_CODE_START(ignore_sysret)
1633	UNWIND_HINT_EMPTY
1634	mov	$-ENOSYS, %eax
1635	sysretl
1636SYM_CODE_END(ignore_sysret)
1637#endif
1638
1639.pushsection .text, "ax"
1640SYM_CODE_START(rewind_stack_do_exit)
1641	UNWIND_HINT_FUNC
1642	/* Prevent any naive code from trying to unwind to our caller. */
1643	xorl	%ebp, %ebp
1644
1645	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rax
1646	leaq	-PTREGS_SIZE(%rax), %rsp
1647	UNWIND_HINT_REGS
1648
1649	call	do_exit
1650SYM_CODE_END(rewind_stack_do_exit)
1651.popsection
1652