xref: /openbmc/linux/arch/x86/entry/entry_32.S (revision 9726bfcd)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 *  Copyright (C) 1991,1992  Linus Torvalds
4 *
5 * entry_32.S contains the system-call and low-level fault and trap handling routines.
6 *
7 * Stack layout while running C code:
8 *	ptrace needs to have all registers on the stack.
9 *	If the order here is changed, it needs to be
10 *	updated in fork.c:copy_process(), signal.c:do_signal(),
11 *	ptrace.c and ptrace.h
12 *
13 *	 0(%esp) - %ebx
14 *	 4(%esp) - %ecx
15 *	 8(%esp) - %edx
16 *	 C(%esp) - %esi
17 *	10(%esp) - %edi
18 *	14(%esp) - %ebp
19 *	18(%esp) - %eax
20 *	1C(%esp) - %ds
21 *	20(%esp) - %es
22 *	24(%esp) - %fs
23 *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
24 *	2C(%esp) - orig_eax
25 *	30(%esp) - %eip
26 *	34(%esp) - %cs
27 *	38(%esp) - %eflags
28 *	3C(%esp) - %oldesp
29 *	40(%esp) - %oldss
30 */
31
32#include <linux/linkage.h>
33#include <linux/err.h>
34#include <asm/thread_info.h>
35#include <asm/irqflags.h>
36#include <asm/errno.h>
37#include <asm/segment.h>
38#include <asm/smp.h>
39#include <asm/percpu.h>
40#include <asm/processor-flags.h>
41#include <asm/irq_vectors.h>
42#include <asm/cpufeatures.h>
43#include <asm/alternative-asm.h>
44#include <asm/asm.h>
45#include <asm/smap.h>
46#include <asm/frame.h>
47#include <asm/nospec-branch.h>
48
49#include "calling.h"
50
51	.section .entry.text, "ax"
52
53/*
54 * We use macros for low-level operations which need to be overridden
55 * for paravirtualization.  The following will never clobber any registers:
56 *   INTERRUPT_RETURN (aka. "iret")
57 *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
58 *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
59 *
60 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
61 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
62 * Allowing a register to be clobbered can shrink the paravirt replacement
63 * enough to patch inline, increasing performance.
64 */
65
/*
 * preempt_stop(clobbers): with CONFIG_PREEMPT this expands to
 * DISABLE_INTERRUPTS(clobbers) followed by TRACE_IRQS_OFF; without
 * CONFIG_PREEMPT it expands to nothing.
 */
66#ifdef CONFIG_PREEMPT
67# define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
68#else
69# define preempt_stop(clobbers)
70#endif
71
/*
 * TRACE_IRQS_IRET: with CONFIG_TRACE_IRQFLAGS, invoke TRACE_IRQS_ON only
 * when the EFLAGS image saved at PT_EFLAGS(%esp) has IF set. Without
 * CONFIG_TRACE_IRQFLAGS the macro is empty. Uses local label 1.
 */
72.macro TRACE_IRQS_IRET
73#ifdef CONFIG_TRACE_IRQFLAGS
74	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
75	jz	1f
76	TRACE_IRQS_ON
771:
78#endif
79.endm
80
/*
 * cr3 bit used by the macros in this file: SWITCH_TO_USER_CR3 sets it in
 * cr3, SWITCH_TO_KERNEL_CR3 clears it, and BUG_IF_WRONG_CR3 tests it.
 */
81#define PTI_SWITCH_MASK         (1 << PAGE_SHIFT)
82
83/*
84 * User gs save/restore
85 *
86 * %gs is used for userland TLS and kernel only uses it for stack
87 * canary which is required to be at %gs:20 by gcc.  Read the comment
88 * at the top of stackprotector.h for more info.
89 *
90 * Local labels 98 and 99 are used.
91 */
/*
 * Two implementations of the same macro set follow, selected by
 * CONFIG_X86_32_LAZY_GS:
 *
 *   PUSH_GS            push the pt_regs gs slot
 *   POP_GS pop=N       drop/restore the gs slot, then skip N more bytes
 *   POP_GS_EX          exception-table fixup for the pop in POP_GS
 *   PTGS_TO_GS         load %gs from PT_GS(%esp)
 *   PTGS_TO_GS_EX      exception-table fixup for PTGS_TO_GS
 *   GS_TO_REG reg      copy %gs into \reg
 *   REG_TO_PTGS reg    store \reg into PT_GS(%esp)
 *   SET_KERNEL_GS reg  load __KERNEL_STACK_CANARY into %gs via \reg
 */
92#ifdef CONFIG_X86_32_LAZY_GS
93
94 /* unfortunately push/pop can't be no-op */
95.macro PUSH_GS
96	pushl	$0
97.endm
98.macro POP_GS pop=0
99	addl	$(4 + \pop), %esp
100.endm
101.macro POP_GS_EX
102.endm
103
104 /* all the rest are no-op */
105.macro PTGS_TO_GS
106.endm
107.macro PTGS_TO_GS_EX
108.endm
109.macro GS_TO_REG reg
110.endm
111.macro REG_TO_PTGS reg
112.endm
113.macro SET_KERNEL_GS reg
114.endm
115
116#else	/* CONFIG_X86_32_LAZY_GS */
117
118.macro PUSH_GS
119	pushl	%gs
120.endm
121
/* Label 98 marks the pop so that POP_GS_EX can attach a fixup to it. */
122.macro POP_GS pop=0
12398:	popl	%gs
124  .if \pop <> 0
125	add	$\pop, %esp
126  .endif
127.endm
/*
 * Must follow a POP_GS: if the pop at 98 faults, the fixup at 99 zeroes
 * the stack slot being popped and retries the pop.
 */
128.macro POP_GS_EX
129.pushsection .fixup, "ax"
13099:	movl	$0, (%esp)
131	jmp	98b
132.popsection
133	_ASM_EXTABLE(98b, 99b)
134.endm
135
136.macro PTGS_TO_GS
13798:	mov	PT_GS(%esp), %gs
138.endm
/*
 * Must follow a PTGS_TO_GS: if the load at 98 faults, the fixup at 99
 * zeroes PT_GS(%esp) and retries the load.
 */
139.macro PTGS_TO_GS_EX
140.pushsection .fixup, "ax"
14199:	movl	$0, PT_GS(%esp)
142	jmp	98b
143.popsection
144	_ASM_EXTABLE(98b, 99b)
145.endm
146
147.macro GS_TO_REG reg
148	movl	%gs, \reg
149.endm
150.macro REG_TO_PTGS reg
151	movl	\reg, PT_GS(%esp)
152.endm
153.macro SET_KERNEL_GS reg
154	movl	$(__KERNEL_STACK_CANARY), \reg
155	movl	\reg, %gs
156.endm
157
158#endif /* CONFIG_X86_32_LAZY_GS */
159
160/* Unconditionally switch to user cr3 */
/*
 * \scratch_reg: general-purpose register, clobbered when the switch runs.
 *
 * The leading ALTERNATIVE is keyed on X86_FEATURE_PTI; its default form
 * jumps to .Lend_\@ and skips the switch entirely. Otherwise cr3 is read,
 * PTI_SWITCH_MASK is ORed in and the value is written back, without
 * checking which cr3 was loaded before.
 */
161.macro SWITCH_TO_USER_CR3 scratch_reg:req
162	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
163
164	movl	%cr3, \scratch_reg
165	orl	$PTI_SWITCH_MASK, \scratch_reg
166	movl	\scratch_reg, %cr3
167.Lend_\@:
168.endm
169
/*
 * Debug-only check (emits code only with CONFIG_DEBUG_ENTRY): executes
 * ud2 when cr3 does not have PTI_SWITCH_MASK set.
 *
 * \no_user_check: when 0 (default), the check is skipped if the CS saved
 * at PT_CS(%esp) has no RPL bits set, so %esp must point at pt_regs.
 * Pass a non-zero value to test cr3 unconditionally without looking at
 * the stack.
 *
 * Clobbers %eax and flags.
 */
170.macro BUG_IF_WRONG_CR3 no_user_check=0
171#ifdef CONFIG_DEBUG_ENTRY
172	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
173	.if \no_user_check == 0
174	/* coming from usermode? */
175	testl	$SEGMENT_RPL_MASK, PT_CS(%esp)
176	jz	.Lend_\@
177	.endif
178	/* On user-cr3? */
179	movl	%cr3, %eax
180	testl	$PTI_SWITCH_MASK, %eax
181	jnz	.Lend_\@
182	/* From userspace with kernel cr3 - BUG */
183	ud2
184.Lend_\@:
185#endif
186.endm
187
188/*
189 * Switch to kernel cr3 if not already loaded and return current cr3 in
190 * \scratch_reg
191 */
/*
 * \scratch_reg: receives the cr3 value that was live on entry (with
 * PTI_SWITCH_MASK set if the user cr3 was loaded, clear otherwise).
 *
 * Note: when the leading ALTERNATIVE takes its "jmp .Lend_\@" form the
 * whole body is skipped and \scratch_reg is left unmodified.
 */
192.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
193	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
194	movl	%cr3, \scratch_reg
195	/* Test if we are already on kernel CR3 */
196	testl	$PTI_SWITCH_MASK, \scratch_reg
197	jz	.Lend_\@
198	andl	$(~PTI_SWITCH_MASK), \scratch_reg
199	movl	\scratch_reg, %cr3
200	/* Return original CR3 in \scratch_reg */
201	orl	$PTI_SWITCH_MASK, \scratch_reg
202.Lend_\@:
203.endm
204
/*
 * Software marker bits stored in the upper half of the saved CS dword:
 *  - CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3 are set by
 *    SWITCH_TO_KERNEL_STACK and consumed by PARANOID_EXIT_TO_KERNEL_MODE.
 *  - CS_FROM_KERNEL is set by FIXUP_FRAME and tested by IRET_FRAME.
 */
205#define CS_FROM_ENTRY_STACK	(1 << 31)
206#define CS_FROM_USER_CR3	(1 << 30)
207#define CS_FROM_KERNEL		(1 << 29)
208
/*
 * FIXUP_FRAME: expects %esp to point at the gs/function slot, with
 * orig_eax, ip, cs and flags above it (see the layout below).
 *
 * Always clears the upper 16 bits of the CS slot. If the frame came from
 * kernel mode (no RPL bits in CS and, with CONFIG_VM86, EFLAGS.VM clear)
 * it additionally sets CS_FROM_KERNEL in the CS slot and pushes ss, sp
 * and a copy of the five existing entries. Frames from user mode or VM86
 * are not extended. Clobbers flags only.
 */
209.macro FIXUP_FRAME
210	/*
211	 * The high bits of the CS dword (__csh) are used for CS_FROM_*.
212	 * Clear them in case hardware didn't do this for us.
213	 */
214	andl	$0x0000ffff, 3*4(%esp)
215
216#ifdef CONFIG_VM86
217	testl	$X86_EFLAGS_VM, 4*4(%esp)
218	jnz	.Lfrom_usermode_no_fixup_\@
219#endif
220	testl	$SEGMENT_RPL_MASK, 3*4(%esp)
221	jnz	.Lfrom_usermode_no_fixup_\@
222
223	orl	$CS_FROM_KERNEL, 3*4(%esp)
224
225	/*
226	 * When we're here from kernel mode; the (exception) stack looks like:
227	 *
228	 *  5*4(%esp) - <previous context>
229	 *  4*4(%esp) - flags
230	 *  3*4(%esp) - cs
231	 *  2*4(%esp) - ip
232	 *  1*4(%esp) - orig_eax
233	 *  0*4(%esp) - gs / function
234	 *
235	 * Lets build a 5 entry IRET frame after that, such that struct pt_regs
236	 * is complete and in particular regs->sp is correct. This gives us
237	 * the original 5 entries as gap:
238	 *
239	 * 12*4(%esp) - <previous context>
240	 * 11*4(%esp) - gap / flags
241	 * 10*4(%esp) - gap / cs
242	 *  9*4(%esp) - gap / ip
243	 *  8*4(%esp) - gap / orig_eax
244	 *  7*4(%esp) - gap / gs / function
245	 *  6*4(%esp) - ss
246	 *  5*4(%esp) - sp
247	 *  4*4(%esp) - flags
248	 *  3*4(%esp) - cs
249	 *  2*4(%esp) - ip
250	 *  1*4(%esp) - orig_eax
251	 *  0*4(%esp) - gs / function
252	 */
253
254	pushl	%ss		# ss
255	pushl	%esp		# sp (points at ss)
256	addl	$6*4, (%esp)	# point sp back at the previous context
	/*
	 * Each push below lowers %esp by 4, so the constant 6*4(%esp)
	 * offset walks down the original entries one slot at a time.
	 */
257	pushl	6*4(%esp)	# flags
258	pushl	6*4(%esp)	# cs
259	pushl	6*4(%esp)	# ip
260	pushl	6*4(%esp)	# orig_eax
261	pushl	6*4(%esp)	# gs / function
262.Lfrom_usermode_no_fixup_\@:
263.endm
264
/*
 * IRET_FRAME: expects %esp to point at the ip slot of the frame
 * (0*4 = ip, 1*4 = cs, 2*4 = flags, 3*4 = sp).
 *
 * Does nothing unless CS_FROM_KERNEL is set in the CS slot. Otherwise it
 * writes flags, cs (with the upper 16 bits cleared) and ip directly below
 * the address held in the saved sp slot, and moves %esp there. %eax and
 * %ecx are used as scratch but saved and restored; flags are clobbered.
 */
265.macro IRET_FRAME
266	testl $CS_FROM_KERNEL, 1*4(%esp)
267	jz .Lfinished_frame_\@
268
269	/*
270	 * Reconstruct the 3 entry IRET frame right after the (modified)
271	 * regs->sp without lowering %esp in between, such that an NMI in the
272	 * middle doesn't scribble our stack.
273	 */
274	pushl	%eax
275	pushl	%ecx
	/* With %eax/%ecx pushed, every frame offset below is shifted by 2*4. */
276	movl	5*4(%esp), %eax		# (modified) regs->sp
277
278	movl	4*4(%esp), %ecx		# flags
279	movl	%ecx, -4(%eax)
280
281	movl	3*4(%esp), %ecx		# cs
282	andl	$0x0000ffff, %ecx
283	movl	%ecx, -8(%eax)
284
285	movl	2*4(%esp), %ecx		# ip
286	movl	%ecx, -12(%eax)
287
288	movl	1*4(%esp), %ecx		# eax
289	movl	%ecx, -16(%eax)
290
291	popl	%ecx
	/* Point %esp at the saved %eax copy; the pop leaves it at ip. */
292	lea	-16(%eax), %esp
293	popl	%eax
294.Lfinished_frame_\@:
295.endm
296
/*
 * SAVE_ALL: push the remaining pt_regs fields and load kernel segments.
 *
 * \pt_regs_ax:    operand pushed into the ax slot (default %eax).
 * \switch_stacks: when > 0, SWITCH_TO_KERNEL_STACK is invoked after the
 *                 registers are saved.
 * \skip_gs:       when non-zero, neither PUSH_GS nor SET_KERNEL_GS is
 *                 emitted.
 *
 * Clears DF, loads __USER_DS into %ds/%es and __KERNEL_PERCPU into %fs.
 * Clobbers %edx, plus whatever SWITCH_TO_KERNEL_STACK clobbers when used.
 */
297.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
298	cld
299.if \skip_gs == 0
300	PUSH_GS
301.endif
302	FIXUP_FRAME
303	pushl	%fs
304	pushl	%es
305	pushl	%ds
306	pushl	\pt_regs_ax
307	pushl	%ebp
308	pushl	%edi
309	pushl	%esi
310	pushl	%edx
311	pushl	%ecx
312	pushl	%ebx
313	movl	$(__USER_DS), %edx
314	movl	%edx, %ds
315	movl	%edx, %es
316	movl	$(__KERNEL_PERCPU), %edx
317	movl	%edx, %fs
318.if \skip_gs == 0
319	SET_KERNEL_GS %edx
320.endif
321	/* Switch to kernel stack if necessary */
322.if \switch_stacks > 0
323	SWITCH_TO_KERNEL_STACK
324.endif
325.endm
326
/*
 * SAVE_ALL_NMI: SAVE_ALL (defaults, no stack switch) followed by the cr3
 * debug check and a switch to the kernel cr3.
 *
 * \cr3_reg: register handed to SWITCH_TO_KERNEL_CR3 as its scratch_reg;
 *           it ends up holding the cr3 that was live on entry.
 *
 * NOTE(review): the trailing .Lend_\@ label is not referenced from within
 * this macro.
 */
327.macro SAVE_ALL_NMI cr3_reg:req
328	SAVE_ALL
329
330	BUG_IF_WRONG_CR3
331
332	/*
333	 * Now switch the CR3 when PTI is enabled.
334	 *
335	 * We can enter with either user or kernel cr3, the code will
336	 * store the old cr3 in \cr3_reg and switches to the kernel cr3
337	 * if necessary.
338	 */
339	SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
340
341.Lend_\@:
342.endm
343
/*
 * RESTORE_INT_REGS: pop the seven integer registers in the reverse order
 * of the pushes done by SAVE_ALL (ebx first, eax last).
 */
344.macro RESTORE_INT_REGS
345	popl	%ebx
346	popl	%ecx
347	popl	%edx
348	popl	%esi
349	popl	%edi
350	popl	%ebp
351	popl	%eax
352.endm
353
/*
 * RESTORE_REGS: pop the integer registers, then %ds, %es, %fs and the gs
 * slot.
 *
 * \pop: extra bytes to drop from the stack after the gs slot (passed
 *       through to POP_GS).
 *
 * Each segment pop has an exception-table entry: if the pop at label
 * 1/2/3 faults, the fixup at 4/5/6 zeroes the stack slot being popped
 * and retries. POP_GS_EX provides the same for the pop in POP_GS.
 */
354.macro RESTORE_REGS pop=0
355	RESTORE_INT_REGS
3561:	popl	%ds
3572:	popl	%es
3583:	popl	%fs
359	POP_GS \pop
360.pushsection .fixup, "ax"
3614:	movl	$0, (%esp)
362	jmp	1b
3635:	movl	$0, (%esp)
364	jmp	2b
3656:	movl	$0, (%esp)
366	jmp	3b
367.popsection
368	_ASM_EXTABLE(1b, 4b)
369	_ASM_EXTABLE(2b, 5b)
370	_ASM_EXTABLE(3b, 6b)
371	POP_GS_EX
372.endm
373
/*
 * RESTORE_ALL_NMI: counterpart of SAVE_ALL_NMI.
 *
 * \cr3_reg: register holding the cr3 value saved at entry; it is written
 *           to cr3 only when it has PTI_SWITCH_MASK set.
 * \pop:     passed through to RESTORE_REGS.
 */
374.macro RESTORE_ALL_NMI cr3_reg:req pop=0
375	/*
376	 * Now switch the CR3 when PTI is enabled.
377	 *
378	 * We enter with kernel cr3 and switch the cr3 to the value
379	 * stored on \cr3_reg, which is either a user or a kernel cr3.
380	 */
381	ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
382
383	testl	$PTI_SWITCH_MASK, \cr3_reg
384	jz	.Lswitched_\@
385
386	/* User cr3 in \cr3_reg - write it to hardware cr3 */
387	movl	\cr3_reg, %cr3
388
389.Lswitched_\@:
390
391	BUG_IF_WRONG_CR3
392
393	RESTORE_REGS pop=\pop
394.endm
395
/*
 * CHECK_AND_APPLY_ESPFIX: expects %esp to point at pt_regs. Emits code
 * only with CONFIG_X86_ESPFIX32. When the frame returns to user mode
 * (not VM86) with an SS selector taken from the LDT, it patches the base
 * of the ESPFIX GDT entry and switches %ss:%esp to the ESPFIX segment;
 * otherwise it leaves the stack alone.
 *
 * Clobbers %eax, %edx and flags; disables interrupts on the ESPFIX path.
 */
396.macro CHECK_AND_APPLY_ESPFIX
397#ifdef CONFIG_X86_ESPFIX32
398#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
399
400	ALTERNATIVE	"jmp .Lend_\@", "", X86_BUG_ESPFIX
401
402	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
403	/*
404	 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
405	 * are returning to the kernel.
406	 * See comments in process.c:copy_thread() for details.
407	 */
408	movb	PT_OLDSS(%esp), %ah
409	movb	PT_CS(%esp), %al
410	andl	$(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
411	cmpl	$((SEGMENT_LDT << 8) | USER_RPL), %eax
412	jne	.Lend_\@	# skip unless returning to user-space with LDT SS
413
414	/*
415	 * Setup and switch to ESPFIX stack
416	 *
417	 * We're returning to userspace with a 16 bit stack. The CPU will not
418	 * restore the high word of ESP for us on executing iret... This is an
419	 * "official" bug of all the x86-compatible CPUs, which we can work
420	 * around to make dosemu and wine happy. We do this by preloading the
421	 * high word of ESP with the high word of the userspace ESP while
422	 * compensating for the offset by changing to the ESPFIX segment with
423	 * a base address that matches for the difference.
424	 */
425	mov	%esp, %edx			/* load kernel esp */
426	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
427	mov	%dx, %ax			/* eax: new kernel esp */
428	sub	%eax, %edx			/* offset (low word is 0) */
429	shr	$16, %edx
430	mov	%dl, GDT_ESPFIX_SS + 4		/* bits 16..23 */
431	mov	%dh, GDT_ESPFIX_SS + 7		/* bits 24..31 */
432	pushl	$__ESPFIX_SS
433	pushl	%eax				/* new kernel esp */
434	/*
435	 * Disable interrupts, but do not irqtrace this section: we
436	 * will soon execute iret and the tracer was already set to
437	 * the irqstate after the IRET:
438	 */
439	DISABLE_INTERRUPTS(CLBR_ANY)
440	lss	(%esp), %esp			/* switch to espfix segment */
441.Lend_\@:
442#endif /* CONFIG_X86_ESPFIX32 */
443.endm
444
445/*
446 * Called with pt_regs fully populated and kernel segments loaded,
447 * so we can access PER_CPU and use the integer registers.
448 *
449 * We need to be very careful here with the %esp switch, because an NMI
450 * can happen everywhere. If the NMI handler finds itself on the
451 * entry-stack, it will overwrite the task-stack and everything we
452 * copied there. So allocate the stack-frame on the task-stack and
453 * switch to it before we do any copying.
454 */
455
/*
 * SWITCH_TO_KERNEL_STACK: expects %esp to point at pt_regs.
 *
 * Skipped on X86_FEATURE_XENPV. Otherwise it always switches to the
 * kernel cr3 and then, only if %esp lies within the per-cpu entry stack,
 * copies the frame to the task stack and switches %esp to the copy.
 *
 * Clobbers %eax (holds the entry cr3), %ecx, %esi, %edi and flags.
 */
456.macro SWITCH_TO_KERNEL_STACK
457
458	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
459
460	BUG_IF_WRONG_CR3
461
462	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
463
464	/*
465	 * %eax now contains the entry cr3 and we carry it forward in
466	 * that register for the time this macro runs
467	 */
468
469	/* Are we on the entry stack? Bail out if not! */
470	movl	PER_CPU_VAR(cpu_entry_area), %ecx
471	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
472	subl	%esp, %ecx	/* ecx = (end of entry_stack) - esp */
473	cmpl	$SIZEOF_entry_stack, %ecx
	/* Unsigned compare: also catches %esp above the end of the stack. */
474	jae	.Lend_\@
475
476	/* Load stack pointer into %esi and %edi */
477	movl	%esp, %esi
478	movl	%esi, %edi
479
480	/* Move %edi to the top of the entry stack */
481	andl	$(MASK_entry_stack), %edi
482	addl	$(SIZEOF_entry_stack), %edi
483
484	/* Load top of task-stack into %edi */
485	movl	TSS_entry2task_stack(%edi), %edi
486
487	/* Special case - entry from kernel mode via entry stack */
488#ifdef CONFIG_VM86
489	movl	PT_EFLAGS(%esp), %ecx		# mix EFLAGS and CS
490	movb	PT_CS(%esp), %cl
491	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
492#else
493	movl	PT_CS(%esp), %ecx
494	andl	$SEGMENT_RPL_MASK, %ecx
495#endif
496	cmpl	$USER_RPL, %ecx
497	jb	.Lentry_from_kernel_\@
498
499	/* Bytes to copy */
500	movl	$PTREGS_SIZE, %ecx
501
502#ifdef CONFIG_VM86
503	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esi)
504	jz	.Lcopy_pt_regs_\@
505
506	/*
507	 * Stack-frame contains 4 additional segment registers when
508	 * coming from VM86 mode
509	 */
510	addl	$(4 * 4), %ecx
511
512#endif
513.Lcopy_pt_regs_\@:
514
515	/* Allocate frame on task-stack */
516	subl	%ecx, %edi
517
518	/* Switch to task-stack */
519	movl	%edi, %esp
520
521	/*
522	 * We are now on the task-stack and can safely copy over the
523	 * stack-frame
524	 */
525	shrl	$2, %ecx
526	cld
527	rep movsl
528
529	jmp .Lend_\@
530
531.Lentry_from_kernel_\@:
532
533	/*
534	 * This handles the case when we enter the kernel from
535	 * kernel-mode and %esp points to the entry-stack. When this
536	 * happens we need to switch to the task-stack to run C code,
537	 * but switch back to the entry-stack again when we approach
538	 * iret and return to the interrupted code-path. This usually
539	 * happens when we hit an exception while restoring user-space
540	 * segment registers on the way back to user-space or when the
541	 * sysenter handler runs with eflags.tf set.
542	 *
543	 * When we switch to the task-stack here, we can't trust the
544	 * contents of the entry-stack anymore, as the exception handler
545	 * might be scheduled out or moved to another CPU. Therefore we
546	 * copy the complete entry-stack to the task-stack and set a
547	 * marker in the iret-frame (bit 31 of the CS dword) to detect
548	 * what we've done on the iret path.
549	 *
550	 * On the iret path we copy everything back and switch to the
551	 * entry-stack, so that the interrupted kernel code-path
552	 * continues on the same stack it was interrupted with.
553	 *
554	 * Be aware that an NMI can happen anytime in this code.
555	 *
556	 * %esi: Entry-Stack pointer (same as %esp)
557	 * %edi: Top of the task stack
558	 * %eax: CR3 on kernel entry
559	 */
560
561	/* Calculate number of bytes on the entry stack in %ecx */
562	movl	%esi, %ecx
563
564	/* %ecx to the top of entry-stack */
565	andl	$(MASK_entry_stack), %ecx
566	addl	$(SIZEOF_entry_stack), %ecx
567
568	/* Number of bytes on the entry stack to %ecx */
569	sub	%esi, %ecx
570
571	/* Mark stackframe as coming from entry stack */
572	orl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
573
574	/*
575	 * Test the cr3 used to enter the kernel and add a marker
576	 * so that we can switch back to it before iret.
577	 */
578	testl	$PTI_SWITCH_MASK, %eax
579	jz	.Lcopy_pt_regs_\@
580	orl	$CS_FROM_USER_CR3, PT_CS(%esp)
581
582	/*
583	 * %esi and %edi are unchanged, %ecx contains the number of
584	 * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
585	 * the stack-frame on task-stack and copy everything over
586	 */
587	jmp .Lcopy_pt_regs_\@
588
589.Lend_\@:
590.endm
591
592/*
593 * Switch back from the kernel stack to the entry stack.
594 *
595 * The %esp register must point to pt_regs on the task stack. It will
596 * first calculate the size of the stack-frame to copy, depending on
597 * whether we return to VM86 mode or not. With that it uses 'rep movsl'
598 * to copy the contents of the stack over to the entry stack.
599 *
600 * We must be very careful here, as we can't trust the contents of the
601 * task-stack once we switched to the entry-stack. When an NMI happens
602 * while on the entry-stack, the NMI handler will switch back to the top
603 * of the task stack, overwriting our stack-frame we are about to copy.
604 * Therefore we switch the stack only after everything is copied over.
605 */
/*
 * SWITCH_TO_ENTRY_STACK: skipped on X86_FEATURE_XENPV. Copies PTREGS_SIZE
 * bytes (plus 4*4 when returning to VM86 mode) from %esp to just below
 * the address in cpu_tss_rw.sp0, then moves %esp to the copy.
 *
 * Clobbers %ecx, %esi, %edi, %ebx and flags.
 */
606.macro SWITCH_TO_ENTRY_STACK
607
608	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
609
610	/* Bytes to copy */
611	movl	$PTREGS_SIZE, %ecx
612
613#ifdef CONFIG_VM86
614	testl	$(X86_EFLAGS_VM), PT_EFLAGS(%esp)
615	jz	.Lcopy_pt_regs_\@
616
617	/* Additional 4 registers to copy when returning to VM86 mode */
618	addl    $(4 * 4), %ecx
619
620.Lcopy_pt_regs_\@:
621#endif
622
623	/* Initialize source and destination for movsl */
624	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
625	subl	%ecx, %edi
626	movl	%esp, %esi
627
628	/* Save future stack pointer in %ebx */
629	movl	%edi, %ebx
630
631	/* Copy over the stack-frame */
632	shrl	$2, %ecx
633	cld
634	rep movsl
635
636	/*
637	 * Switch to entry-stack - needs to happen after everything is
638	 * copied because the NMI handler will overwrite the task-stack
639	 * when on entry-stack
640	 */
641	movl	%ebx, %esp
642
643.Lend_\@:
644.endm
645
646/*
647 * This macro handles the case when we return to kernel-mode on the iret
648 * path and have to switch back to the entry stack and/or user-cr3
649 *
650 * See the comments below the .Lentry_from_kernel_\@ label in the
651 * SWITCH_TO_KERNEL_STACK macro for more details.
652 */
/*
 * PARANOID_EXIT_TO_KERNEL_MODE: expects %esp to point at pt_regs.
 *
 * Does nothing unless CS_FROM_ENTRY_STACK is set in the saved CS. On the
 * slow path it clobbers %esi, %edi, %ecx, %ebx and flags, and also %eax
 * when CS_FROM_USER_CR3 is set and the user cr3 is restored. Both marker
 * bits are cleared from the saved CS when they are consumed.
 */
653.macro PARANOID_EXIT_TO_KERNEL_MODE
654
655	/*
656	 * Test if we entered the kernel with the entry-stack. Most
657	 * likely we did not, because this code only runs on the
658	 * return-to-kernel path.
659	 */
660	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
661	jz	.Lend_\@
662
663	/* Unlikely slow-path */
664
665	/* Clear marker from stack-frame */
666	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
667
668	/* Copy the remaining task-stack contents to entry-stack */
669	movl	%esp, %esi
670	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
671
672	/* Bytes on the task-stack to ecx */
673	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
674	subl	%esi, %ecx
675
676	/* Allocate stack-frame on entry-stack */
677	subl	%ecx, %edi
678
679	/*
680	 * Save future stack-pointer, we must not switch until the
681	 * copy is done, otherwise the NMI handler could destroy the
682	 * contents of the task-stack we are about to copy.
683	 */
684	movl	%edi, %ebx
685
686	/* Do the copy */
687	shrl	$2, %ecx
688	cld
689	rep movsl
690
691	/* Safe to switch to entry-stack now */
692	movl	%ebx, %esp
693
694	/*
695	 * We came from entry-stack and need to check if we also need to
696	 * switch back to user cr3.
697	 */
698	testl	$CS_FROM_USER_CR3, PT_CS(%esp)
699	jz	.Lend_\@
700
701	/* Clear marker from stack-frame */
702	andl	$(~CS_FROM_USER_CR3), PT_CS(%esp)
703
704	SWITCH_TO_USER_CR3 scratch_reg=%eax
705
706.Lend_\@:
707.endm
708/*
709 * %eax: prev task
710 * %edx: next task
711 */
/*
 * __switch_to_asm: save the callee-saved registers and flags on the
 * current stack, store %esp into prev (%eax), load %esp from next (%edx),
 * restore the same registers from the new stack and tail-jump to
 * __switch_to. %eax and %edx are not modified here.
 */
712ENTRY(__switch_to_asm)
713	/*
714	 * Save callee-saved registers
715	 * This must match the order in struct inactive_task_frame
716	 */
717	pushl	%ebp
718	pushl	%ebx
719	pushl	%edi
720	pushl	%esi
721	pushfl
722
723	/* switch stack */
724	movl	%esp, TASK_threadsp(%eax)
725	movl	TASK_threadsp(%edx), %esp
726
727#ifdef CONFIG_STACKPROTECTOR
	/* %ebx is free as scratch here: it is reloaded by the popl below */
728	movl	TASK_stack_canary(%edx), %ebx
729	movl	%ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
730#endif
731
732#ifdef CONFIG_RETPOLINE
733	/*
734	 * When switching from a shallower to a deeper call stack
735	 * the RSB may either underflow or use entries populated
736	 * with userspace addresses. On CPUs where those concerns
737	 * exist, overwrite the RSB with entries which capture
738	 * speculative execution to prevent attack.
739	 */
740	FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
741#endif
742
743	/* restore callee-saved registers */
744	popfl
745	popl	%esi
746	popl	%edi
747	popl	%ebx
748	popl	%ebp
749
750	jmp	__switch_to
751END(__switch_to_asm)
752
753/*
754 * The unwinder expects the last frame on the stack to always be at the same
755 * offset from the end of the page, which allows it to validate the stack.
756 * Calling schedule_tail() directly would break that convention because it's an
757 * asmlinkage function so its argument has to be pushed on the stack.  This
758 * wrapper creates a proper "end of stack" frame header before the call.
759 */
/*
 * schedule_tail_wrapper: in %eax = argument for schedule_tail.
 * Passes %eax on the stack, calls schedule_tail, then reloads %eax from
 * the pushed slot, all inside a FRAME_BEGIN/FRAME_END pair.
 */
760ENTRY(schedule_tail_wrapper)
761	FRAME_BEGIN
762
763	pushl	%eax
764	call	schedule_tail
765	popl	%eax
766
767	FRAME_END
768	ret
769ENDPROC(schedule_tail_wrapper)
770/*
771 * A newly forked process directly context switches into this address.
772 *
773 * eax: prev task we switched from
774 * ebx: kernel thread func (NULL for user thread)
775 * edi: kernel thread arg
776 */
/*
 * ret_from_fork: calls schedule_tail_wrapper, then either runs the kernel
 * thread function in %ebx with %edi as its argument (label 1) or goes
 * straight to the syscall-return path (label 2), which ends in
 * restore_all.
 */
777ENTRY(ret_from_fork)
778	call	schedule_tail_wrapper
779
780	testl	%ebx, %ebx
781	jnz	1f		/* kernel threads are uncommon */
782
7832:
784	/* When we fork, we trace the syscall return in the child, too. */
785	movl    %esp, %eax
786	call    syscall_return_slowpath
787	STACKLEAK_ERASE
788	jmp     restore_all
789
790	/* kernel thread */
7911:	movl	%edi, %eax
792	CALL_NOSPEC %ebx
793	/*
794	 * A kernel thread is allowed to return here after successfully
795	 * calling do_execve().  Exit to userspace to complete the execve()
796	 * syscall.
797	 */
798	movl	$0, PT_EAX(%esp)
799	jmp	2b
800END(ret_from_fork)
801
802/*
803 * Return to user mode is not as complex as all this looks,
804 * but we want the default path for a system call return to
805 * go as quickly as possible which is why some of this is
806 * less clear than it otherwise should be.
807 */
808
809	# userspace resumption stub bypassing syscall exit tracing
/*
 * ret_from_exception / ret_from_intr / resume_userspace
 *
 * Expects %esp to point at pt_regs. The saved CS RPL (and, with
 * CONFIG_VM86, the saved EFLAGS.VM bit) decide the exit path: values
 * below USER_RPL go to restore_all_kernel, everything else falls through
 * to resume_userspace, which calls prepare_exit_to_usermode(%esp) with
 * interrupts disabled and then jumps to restore_all.
 */
810	ALIGN
811ret_from_exception:
812	preempt_stop(CLBR_ANY)
813ret_from_intr:
814#ifdef CONFIG_VM86
815	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
816	movb	PT_CS(%esp), %al
817	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
818#else
819	/*
820	 * We can be coming here from child spawned by kernel_thread().
821	 */
822	movl	PT_CS(%esp), %eax
823	andl	$SEGMENT_RPL_MASK, %eax
824#endif
825	cmpl	$USER_RPL, %eax
826	jb	restore_all_kernel		# not returning to v8086 or userspace
827
828ENTRY(resume_userspace)
829	DISABLE_INTERRUPTS(CLBR_ANY)
830	TRACE_IRQS_OFF
831	movl	%esp, %eax
832	call	prepare_exit_to_usermode
833	jmp	restore_all
834END(ret_from_exception)
835
836GLOBAL(__begin_SYSENTER_singlestep_region)
837/*
838 * All code from here through __end_SYSENTER_singlestep_region is subject
839 * to being single-stepped if a user program sets TF and executes SYSENTER.
840 * There is absolutely nothing that we can do to prevent this from happening
841 * (thanks Intel!).  To keep our handling of this situation as simple as
842 * possible, we handle TF just like AC and NT, except that our #DB handler
843 * will ignore all of the single-step traps generated in this range.
844 */
845
846#ifdef CONFIG_XEN_PV
847/*
848 * Xen doesn't set %esp to be precisely what the normal SYSENTER
849 * entry point expects, so fix it up before using the normal path.
850 */
851ENTRY(xen_sysenter_target)
852	addl	$5*4, %esp			/* remove xen-provided frame */
	/* Join the native path after its cr3 and stack switch. */
853	jmp	.Lsysenter_past_esp
854#endif
855
856/*
857 * 32-bit SYSENTER entry.
858 *
859 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
860 * if X86_FEATURE_SEP is available.  This is the preferred system call
861 * entry on 32-bit systems.
862 *
863 * The SYSENTER instruction, in principle, should *only* occur in the
864 * vDSO.  In practice, a small number of Android devices were shipped
865 * with a copy of Bionic that inlined a SYSENTER instruction.  This
866 * never happened in any of Google's Bionic versions -- it only happened
867 * in a narrow range of Intel-provided versions.
868 *
869 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
870 * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
871 * SYSENTER does not save anything on the stack,
872 * and does not save old EIP (!!!), ESP, or EFLAGS.
873 *
874 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
875 * user and/or vm86 state), we explicitly disable the SYSENTER
876 * instruction in vm86 mode by reprogramming the MSRs.
877 *
878 * Arguments:
879 * eax  system call number
880 * ebx  arg1
881 * ecx  arg2
882 * edx  arg3
883 * esi  arg4
884 * edi  arg5
885 * ebp  user stack
886 * 0(%ebp) arg6
887 */
888ENTRY(entry_SYSENTER_32)
889	/*
890	 * On entry-stack with all userspace-regs live - save and
891	 * restore eflags and %eax to use it as scratch-reg for the cr3
892	 * switch.
893	 */
894	pushfl
895	pushl	%eax
896	BUG_IF_WRONG_CR3 no_user_check=1
897	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
898	popl	%eax
899	popfl
900
901	/* Stack empty again, switch to task stack */
902	movl	TSS_entry2task_stack(%esp), %esp
903
904.Lsysenter_past_esp:
	/* Build the hardware-style frame by hand, in pt_regs order. */
905	pushl	$__USER_DS		/* pt_regs->ss */
906	pushl	%ebp			/* pt_regs->sp (stashed in bp) */
907	pushfl				/* pt_regs->flags (except IF = 0) */
908	orl	$X86_EFLAGS_IF, (%esp)	/* Fix IF */
909	pushl	$__USER_CS		/* pt_regs->cs */
910	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
911	pushl	%eax			/* pt_regs->orig_ax */
912	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest, stack already switched */
913
914	/*
915	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
916	 * and TF ourselves.  To save a few cycles, we can check whether
917	 * either was set instead of doing an unconditional popfl.
918	 * This needs to happen before enabling interrupts so that
919	 * we don't get preempted with NT set.
920	 *
921	 * If TF is set, we will single-step all the way to here -- do_debug
922	 * will ignore all the traps.  (Yes, this is slow, but so is
923	 * single-stepping in general.  This allows us to avoid having
924	 * a more complicated code to handle the case where a user program
925	 * forces us to single-step through the SYSENTER entry code.)
926	 *
927	 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
928	 * out-of-line as an optimization: NT is unlikely to be set in the
929	 * majority of the cases and instead of polluting the I$ unnecessarily,
930	 * we're keeping that code behind a branch which will predict as
931	 * not-taken and therefore its instructions won't be fetched.
932	 */
933	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
934	jnz	.Lsysenter_fix_flags
935.Lsysenter_flags_fixed:
936
937	/*
938	 * User mode is traced as though IRQs are on, and SYSENTER
939	 * turned them off.
940	 */
941	TRACE_IRQS_OFF
942
943	movl	%esp, %eax
944	call	do_fast_syscall_32
	/* A zero return value selects the IRET path (.Lsyscall_32_done). */
945	/* XEN PV guests always use IRET path */
946	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
947		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
948
949	STACKLEAK_ERASE
950
951/* Opportunistic SYSEXIT */
952	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
953
954	/*
955	 * Setup entry stack - we keep the pointer in %eax and do the
956	 * switch after almost all user-state is restored.
957	 */
958
959	/* Load entry stack pointer and allocate frame for eflags/eax */
960	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
961	subl	$(2*4), %eax
962
963	/* Copy eflags and eax to entry stack */
964	movl	PT_EFLAGS(%esp), %edi
965	movl	PT_EAX(%esp), %esi
966	movl	%edi, (%eax)
967	movl	%esi, 4(%eax)
968
969	/* Restore user registers and segments */
970	movl	PT_EIP(%esp), %edx	/* pt_regs->ip */
971	movl	PT_OLDESP(%esp), %ecx	/* pt_regs->sp */
9721:	mov	PT_FS(%esp), %fs
973	PTGS_TO_GS
974
975	popl	%ebx			/* pt_regs->bx */
976	addl	$2*4, %esp		/* skip pt_regs->cx and pt_regs->dx */
977	popl	%esi			/* pt_regs->si */
978	popl	%edi			/* pt_regs->di */
979	popl	%ebp			/* pt_regs->bp */
980
981	/* Switch to entry stack */
982	movl	%eax, %esp
983
984	/* Now ready to switch the cr3 */
985	SWITCH_TO_USER_CR3 scratch_reg=%eax
986
987	/*
988	 * Restore all flags except IF. (We restore IF separately because
989	 * STI gives a one-instruction window in which we won't be interrupted,
990	 * whereas POPF does not.)
991	 */
992	btrl	$X86_EFLAGS_IF_BIT, (%esp)
993	BUG_IF_WRONG_CR3 no_user_check=1
994	popfl
995	popl	%eax
996
997	/*
998	 * Return back to the vDSO, which will pop ecx and edx.
999	 * Don't bother with DS and ES (they already contain __USER_DS).
1000	 */
1001	sti
1002	sysexit
1003
	/* Fixup for the %fs load at 1: zero the saved fs slot and retry. */
1004.pushsection .fixup, "ax"
10052:	movl	$0, PT_FS(%esp)
1006	jmp	1b
1007.popsection
1008	_ASM_EXTABLE(1b, 2b)
1009	PTGS_TO_GS_EX
1010
1011.Lsysenter_fix_flags:
	/* Reload EFLAGS with only X86_EFLAGS_FIXED set. */
1012	pushl	$X86_EFLAGS_FIXED
1013	popfl
1014	jmp	.Lsysenter_flags_fixed
1015GLOBAL(__end_SYSENTER_singlestep_region)
1016ENDPROC(entry_SYSENTER_32)
1017
1018/*
1019 * 32-bit legacy system call entry.
1020 *
1021 * 32-bit x86 Linux system calls traditionally used the INT $0x80
1022 * instruction.  INT $0x80 lands here.
1023 *
1024 * This entry point can be used by any 32-bit program to perform system calls.
1025 * Instances of INT $0x80 can be found inline in various programs and
1026 * libraries.  It is also used by the vDSO's __kernel_vsyscall
1027 * fallback for hardware that doesn't support a faster entry method.
1028 * Restarted 32-bit system calls also fall back to INT $0x80
1029 * regardless of what instruction was originally used to do the system
1030 * call.  (64-bit programs can use INT $0x80 as well, but they can
1031 * only run on 64-bit kernels and therefore land in
1032 * entry_INT80_compat.)
1033 *
1034 * This is considered a slow path.  It is not used by most libc
1035 * implementations on modern hardware except during process startup.
1036 *
1037 * Arguments:
1038 * eax  system call number
1039 * ebx  arg1
1040 * ecx  arg2
1041 * edx  arg3
1042 * esi  arg4
1043 * edi  arg5
1044 * ebp  arg6
1045 */
ENTRY(entry_INT80_32)
	ASM_CLAC
	pushl	%eax			/* pt_regs->orig_ax */

	SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1	/* save rest */

	/*
	 * User mode is traced as though IRQs are on, and the interrupt gate
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_int80_syscall_32
.Lsyscall_32_done:

	STACKLEAK_ERASE

	/*
	 * Common return-to-user path. Expects %esp to point at pt_regs on
	 * the task stack.
	 */
restore_all:
	TRACE_IRQS_IRET
	SWITCH_TO_ENTRY_STACK
.Lrestore_all_notrace:
	CHECK_AND_APPLY_ESPFIX
.Lrestore_nocheck:
	/* Switch back to user CR3 */
	SWITCH_TO_USER_CR3 scratch_reg=%eax

	BUG_IF_WRONG_CR3

	/* Restore user state */
	RESTORE_REGS pop=4			# skip orig_eax/error_code

	/*
	 * .Lirq_return is the jump target for every path that still needs
	 * IRET_FRAME to run before the IRET.
	 */
.Lirq_return:
	IRET_FRAME

	/*
	 * .Lirq_return_iret must sit directly on the IRET: it is the address
	 * registered in the exception table below, so a faulting IRET is
	 * redirected to iret_exc. The entry must not point at IRET_FRAME,
	 * otherwise a faulting IRET has no fixup.
	 */
.Lirq_return_iret:
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
	 * when returning from IPI handler and when returning from
	 * scheduler to user-space.
	 */
	INTERRUPT_RETURN

	/*
	 * Return-to-kernel path. Expects %esp to point at pt_regs; reached
	 * from ret_from_intr when the saved CS is not a user-mode one.
	 */
restore_all_kernel:
#ifdef CONFIG_PREEMPT
	DISABLE_INTERRUPTS(CLBR_ANY)
	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	.Lno_preempt
	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
	jz	.Lno_preempt
	call	preempt_schedule_irq
.Lno_preempt:
#endif
	TRACE_IRQS_IRET
	PARANOID_EXIT_TO_KERNEL_MODE
	BUG_IF_WRONG_CR3
	RESTORE_REGS 4
	jmp	.Lirq_return

	/* Fixup target for a faulting IRET at .Lirq_return_iret. */
.section .fixup, "ax"
ENTRY(iret_exc)
	pushl	$0				# no error code
	pushl	$do_iret_error

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * The stack-frame here is the one that iret faulted on, so it's a
	 * return-to-user frame. We are on kernel-cr3 because we come here from
	 * the fixup code. This confuses the CR3 checker, so switch to user-cr3
	 * as the checker expects it.
	 */
	pushl	%eax
	SWITCH_TO_USER_CR3 scratch_reg=%eax
	popl	%eax
#endif

	jmp	common_exception
.previous
	_ASM_EXTABLE(.Lirq_return_iret, iret_exc)
ENDPROC(entry_INT80_32)
1123
1124.macro FIXUP_ESPFIX_STACK
1125/*
1126 * Switch back from the ESPFIX stack to the normal zero-based stack
1127 *
1128 * We can't call C functions using the ESPFIX stack. This code reads
1129 * the high word of the segment base from the GDT and switches to the
1130 * normal stack and adjusts ESP with the matching offset.
 *
 * Clobbers %eax; on exit %eax holds the same value that was loaded
 * into %esp. Expands to nothing without CONFIG_X86_ESPFIX32.
1131 */
1132#ifdef CONFIG_X86_ESPFIX32
1133	/* fixup the stack */
1134	mov	GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
1135	mov	GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
1136	shl	$16, %eax
1137	addl	%esp, %eax			/* the adjusted stack pointer */
	/* build a (new %esp, __KERNEL_DS) far pointer on the stack for lss */
1138	pushl	$__KERNEL_DS
1139	pushl	%eax
1140	lss	(%esp), %esp			/* switch to the normal stack segment */
1141#endif
1142.endm
1143.macro UNWIND_ESPFIX_STACK
/*
 * If %ss currently holds __ESPFIX_SS, reload %ds and %es with __KERNEL_DS
 * and leave the ESPFIX stack via FIXUP_ESPFIX_STACK; otherwise skip to
 * local label 27. Clobbers %eax. Expands to nothing without
 * CONFIG_X86_ESPFIX32.
 */
1144#ifdef CONFIG_X86_ESPFIX32
1145	movl	%ss, %eax
1146	/* see if on espfix stack */
1147	cmpw	$__ESPFIX_SS, %ax
1148	jne	27f
1149	movl	$__KERNEL_DS, %eax
1150	movl	%eax, %ds
1151	movl	%eax, %es
1152	/* switch to normal stack */
1153	FIXUP_ESPFIX_STACK
115427:
1155#endif
1156.endm
1157
1158/*
1159 * Build the entry stubs with some assembler magic.
1160 * We pack 1 stub into every 8-byte block.
1161 */
1162	.align 8
/*
 * One stub per vector in [FIRST_EXTERNAL_VECTOR, FIRST_SYSTEM_VECTOR).
 * Each stub pushes (~vector + 0x80) and jumps to common_interrupt, which
 * adds -0x80 again so that the value left on the stack is ~vector.
 * The .align 8 inside the loop pads every stub to an 8-byte slot.
 */
1163ENTRY(irq_entries_start)
1164    vector=FIRST_EXTERNAL_VECTOR
1165    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
1166	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
1167    vector=vector+1
1168	jmp	common_interrupt
1169	.align	8
1170    .endr
1171END(irq_entries_start)
1172
1173#ifdef CONFIG_X86_LOCAL_APIC
1174	.align 8
/*
 * Same stub layout as irq_entries_start, but covering the vectors in
 * [FIRST_SYSTEM_VECTOR, NR_VECTORS) and jumping to common_spurious.
 */
1175ENTRY(spurious_entries_start)
1176    vector=FIRST_SYSTEM_VECTOR
1177    .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
1178	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
1179    vector=vector+1
1180	jmp	common_spurious
1181	.align	8
1182    .endr
1183END(spurious_entries_start)
1184
/*
 * Common tail of the spurious_entries_start stubs: undo the +0x80 bias on
 * the pushed vector word, save the register frame and call
 * smp_spurious_interrupt with the pt_regs pointer in %eax, then leave
 * through ret_from_intr.
 */
1185common_spurious:
1186	ASM_CLAC
1187	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
1188	SAVE_ALL switch_stacks=1
1189	ENCODE_FRAME_POINTER
1190	TRACE_IRQS_OFF
1191	movl	%esp, %eax			/* pt_regs pointer */
1192	call	smp_spurious_interrupt
1193	jmp	ret_from_intr
1194ENDPROC(common_spurious)
1195#endif
1196
1197/*
1198 * the CPU automatically disables interrupts when executing an IRQ vector,
1199 * so IRQ-flags tracing has to follow that:
1200 */
1201	.p2align CONFIG_X86_L1_CACHE_SHIFT
/*
 * Common tail of the irq_entries_start stubs: undo the +0x80 bias on the
 * pushed vector word, save the register frame and call do_IRQ with the
 * pt_regs pointer in %eax, then leave through ret_from_intr.
 */
1202common_interrupt:
1203	ASM_CLAC
1204	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
1205
1206	SAVE_ALL switch_stacks=1
1207	ENCODE_FRAME_POINTER
1208	TRACE_IRQS_OFF
1209	movl	%esp, %eax			/* pt_regs pointer */
1210	call	do_IRQ
1211	jmp	ret_from_intr
1212ENDPROC(common_interrupt)
1213
/*
 * BUILD_INTERRUPT3(name, nr, fn): emit an entry point called 'name' that
 * pushes ~(nr), saves the register frame, calls 'fn' with the pt_regs
 * pointer in %eax and leaves through ret_from_intr.
 *
 * NOTE(review): TRACE_IRQS_OFF is the only statement in the body without a
 * trailing ';' - presumably its expansion is either empty or supplies its
 * own separator; confirm against its definition before changing this.
 */
1214#define BUILD_INTERRUPT3(name, nr, fn)			\
1215ENTRY(name)						\
1216	ASM_CLAC;					\
1217	pushl	$~(nr);					\
1218	SAVE_ALL switch_stacks=1;			\
1219	ENCODE_FRAME_POINTER;				\
1220	TRACE_IRQS_OFF					\
1221	movl	%esp, %eax;				\
1222	call	fn;					\
1223	jmp	ret_from_intr;				\
1224ENDPROC(name)
1225
/*
 * BUILD_INTERRUPT(name, nr): shorthand for BUILD_INTERRUPT3 whose handler
 * is the token-pasted symbol smp_<name>.
 */
1226#define BUILD_INTERRUPT(name, nr)		\
1227	BUILD_INTERRUPT3(name, nr, smp_##name);	\
1228
1229/* The include is where all of the SMP etc. interrupts come from */
1230#include <asm/entry_arch.h>
1231
/*
 * Exception stub: push a zero placeholder error code and the address of
 * do_coprocessor_error, then enter common_exception, which reads the
 * handler address and the error code back from the stack.
 */
1232ENTRY(coprocessor_error)
1233	ASM_CLAC
1234	pushl	$0				# no error code
1235	pushl	$do_coprocessor_error
1236	jmp	common_exception
1237END(coprocessor_error)
1238
/*
 * Exception stub: push a zero placeholder error code and a handler
 * address, then enter common_exception. With CONFIG_X86_INVD_BUG the
 * handler push is an ALTERNATIVE: do_general_protection by default,
 * do_simd_coprocessor_error when X86_FEATURE_XMM is set.
 */
1239ENTRY(simd_coprocessor_error)
1240	ASM_CLAC
1241	pushl	$0				# no error code
1242#ifdef CONFIG_X86_INVD_BUG
1243	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
1244	ALTERNATIVE "pushl	$do_general_protection",	\
1245		    "pushl	$do_simd_coprocessor_error",	\
1246		    X86_FEATURE_XMM
1247#else
1248	pushl	$do_simd_coprocessor_error
1249#endif
1250	jmp	common_exception
1251END(simd_coprocessor_error)
1252
/*
 * Exception stub: push -1 in the error-code position and the address of
 * do_device_not_available, then enter common_exception.
 */
1253ENTRY(device_not_available)
1254	ASM_CLAC
1255	pushl	$-1				# mark this as an int
1256	pushl	$do_device_not_available
1257	jmp	common_exception
1258END(device_not_available)
1259
1260#ifdef CONFIG_PARAVIRT
/*
 * Bare iret. The exception-table entry redirects a fault raised by this
 * instruction to iret_exc.
 */
1261ENTRY(native_iret)
1262	iret
1263	_ASM_EXTABLE(native_iret, iret_exc)
1264END(native_iret)
1265#endif
1266
/*
 * Exception stub: push a zero placeholder error code and the address of
 * do_overflow, then enter common_exception.
 */
1267ENTRY(overflow)
1268	ASM_CLAC
1269	pushl	$0				# no error code
1270	pushl	$do_overflow
1271	jmp	common_exception
1272END(overflow)
1273
/*
 * Exception stub: push a zero placeholder error code and the address of
 * do_bounds, then enter common_exception.
 */
1274ENTRY(bounds)
1275	ASM_CLAC
1276	pushl	$0				# no error code
1277	pushl	$do_bounds
1278	jmp	common_exception
1279END(bounds)
1280
/*
 * Exception stub: push a zero placeholder error code and the address of
 * do_invalid_op, then enter common_exception.
 */
1281ENTRY(invalid_op)
1282	ASM_CLAC
1283	pushl	$0				# no error code
1284	pushl	$do_invalid_op
1285	jmp	common_exception
1286END(invalid_op)
1287
/*
 * Exception stub: push a zero placeholder error code and the address of
 * do_coprocessor_segment_overrun, then enter common_exception.
 */
1288ENTRY(coprocessor_segment_overrun)
1289	ASM_CLAC
1290	pushl	$0				# no error code
1291	pushl	$do_coprocessor_segment_overrun
1292	jmp	common_exception
1293END(coprocessor_segment_overrun)
1294
/*
 * Exception stub: push only the address of do_invalid_TSS and enter
 * common_exception. No error code is pushed here, so common_exception
 * reads whatever already sits in that stack slot - presumably the error
 * code the CPU pushes for this exception (confirm against the SDM).
 */
1295ENTRY(invalid_TSS)
1296	ASM_CLAC
1297	pushl	$do_invalid_TSS
1298	jmp	common_exception
1299END(invalid_TSS)
1300
/*
 * Exception stub: push only the address of do_segment_not_present and
 * enter common_exception. No error code is pushed here - presumably the
 * CPU has already pushed one for this exception (confirm against the SDM).
 */
1301ENTRY(segment_not_present)
1302	ASM_CLAC
1303	pushl	$do_segment_not_present
1304	jmp	common_exception
1305END(segment_not_present)
1306
/*
 * Exception stub: push only the address of do_stack_segment and enter
 * common_exception. No error code is pushed here - presumably the CPU has
 * already pushed one for this exception (confirm against the SDM).
 */
1307ENTRY(stack_segment)
1308	ASM_CLAC
1309	pushl	$do_stack_segment
1310	jmp	common_exception
1311END(stack_segment)
1312
/*
 * Exception stub: push only the address of do_alignment_check and enter
 * common_exception. No error code is pushed here - presumably the CPU has
 * already pushed one for this exception (confirm against the SDM).
 */
1313ENTRY(alignment_check)
1314	ASM_CLAC
1315	pushl	$do_alignment_check
1316	jmp	common_exception
1317END(alignment_check)
1318
/*
 * Exception stub: push a zero placeholder error code and the address of
 * do_divide_error, then enter common_exception.
 */
1319ENTRY(divide_error)
1320	ASM_CLAC
1321	pushl	$0				# no error code
1322	pushl	$do_divide_error
1323	jmp	common_exception
1324END(divide_error)
1325
1326#ifdef CONFIG_X86_MCE
/*
 * Exception stub: push a zero placeholder error code and a handler
 * address, then enter common_exception. Unlike the other stubs, the
 * handler push has no '$': it pushes the value stored at
 * machine_check_vector, not the address of that symbol.
 */
1327ENTRY(machine_check)
1328	ASM_CLAC
1329	pushl	$0				# no error code
1330	pushl	machine_check_vector		# memory operand: current handler pointer
1331	jmp	common_exception
1332END(machine_check)
1333#endif
1334
/*
 * Exception stub: push a zero placeholder error code and the address of
 * do_spurious_interrupt_bug, then enter common_exception.
 */
1335ENTRY(spurious_interrupt_bug)
1336	ASM_CLAC
1337	pushl	$0				# no error code
1338	pushl	$do_spurious_interrupt_bug
1339	jmp	common_exception
1340END(spurious_interrupt_bug)
1341
1342#ifdef CONFIG_XEN_PV
/*
 * Xen event callback entry. Saves the register frame, then either diverts
 * to xen_iret_crit_fixup (when the saved EIP lies inside the
 * xen_iret_direct critical region) or falls into xen_do_upcall, which
 * calls xen_evtchn_do_upcall with the pt_regs pointer in %eax and leaves
 * through ret_from_intr.
 */
1343ENTRY(xen_hypervisor_callback)
1344	pushl	$-1				/* orig_ax = -1 => not a system call */
1345	SAVE_ALL
1346	ENCODE_FRAME_POINTER
1347	TRACE_IRQS_OFF
1348
1349	/*
1350	 * Check to see if we got the event in the critical
1351	 * region in xen_iret_direct, after we've reenabled
1352	 * events and checked for pending events.  This simulates
1353	 * iret instruction's behaviour where it delivers a
1354	 * pending interrupt when enabling interrupts:
1355	 */
1356	movl	PT_EIP(%esp), %eax
	/* take the fixup only if xen_iret_start_crit <= EIP < xen_iret_end_crit */
1357	cmpl	$xen_iret_start_crit, %eax
1358	jb	1f
1359	cmpl	$xen_iret_end_crit, %eax
1360	jae	1f
1361
1362	jmp	xen_iret_crit_fixup
1363
1364ENTRY(xen_do_upcall)
13651:	mov	%esp, %eax			/* pt_regs pointer */
1366	call	xen_evtchn_do_upcall
1367#ifndef CONFIG_PREEMPT
1368	call	xen_maybe_preempt_hcall
1369#endif
1370	jmp	ret_from_intr
1371ENDPROC(xen_hypervisor_callback)
1372
1373/*
1374 * Hypervisor uses this for application faults while it executes.
1375 * We get here for two reasons:
1376 *  1. Fault while reloading DS, ES, FS or GS
1377 *  2. Fault while executing IRET
1378 * Category 1 we fix up by reattempting the load, and zeroing the segment
1379 * register if the load fails.
1380 * Category 2 we fix up by jumping to do_iret_error. We cannot use the
1381 * normal Linux return path in this case because if we use the IRET hypercall
1382 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1383 * We distinguish between categories by maintaining a status value in EAX.
1384 */
1385ENTRY(xen_failsafe_callback)
1386	pushl	%eax
	/* status flag: stays 1 unless one of the fixups below zeroes it */
1387	movl	$1, %eax
	/*
	 * Reload the four segment registers from the words above the saved
	 * %eax. Each load has an exception-table entry pointing at the
	 * matching fixup (6-9) below.
	 */
13881:	mov	4(%esp), %ds
13892:	mov	8(%esp), %es
13903:	mov	12(%esp), %fs
13914:	mov	16(%esp), %gs
1392	/* EAX == 0 => Category 1 (Bad segment)
1393	   EAX != 0 => Category 2 (Bad IRET) */
1394	testl	%eax, %eax
	/* popl and lea leave EFLAGS alone, so jz still sees the testl result */
1395	popl	%eax
1396	lea	16(%esp), %esp			/* drop the four selector words */
1397	jz	5f
1398	jmp	iret_exc
13995:	pushl	$-1				/* orig_ax = -1 => not a system call */
1400	SAVE_ALL
1401	ENCODE_FRAME_POINTER
1402	jmp	ret_from_exception
1403
	/*
	 * Fixups for faulting segment loads: zero the offending selector word
	 * on the stack (which also leaves %eax == 0, marking category 1) and
	 * retry the load.
	 */
1404.section .fixup, "ax"
14056:	xorl	%eax, %eax
1406	movl	%eax, 4(%esp)
1407	jmp	1b
14087:	xorl	%eax, %eax
1409	movl	%eax, 8(%esp)
1410	jmp	2b
14118:	xorl	%eax, %eax
1412	movl	%eax, 12(%esp)
1413	jmp	3b
14149:	xorl	%eax, %eax
1415	movl	%eax, 16(%esp)
1416	jmp	4b
1417.previous
1418	_ASM_EXTABLE(1b, 6b)
1419	_ASM_EXTABLE(2b, 7b)
1420	_ASM_EXTABLE(3b, 8b)
1421	_ASM_EXTABLE(4b, 9b)
1422ENDPROC(xen_failsafe_callback)
1423#endif /* CONFIG_XEN_PV */
1424
1425#ifdef CONFIG_XEN_PVHVM
/*
 * Entry point xen_hvm_callback_vector: pushes ~HYPERVISOR_CALLBACK_VECTOR
 * and calls xen_evtchn_do_upcall (see BUILD_INTERRUPT3).
 */
1426BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1427		 xen_evtchn_do_upcall)
1428#endif
1429
1430
1431#if IS_ENABLED(CONFIG_HYPERV)
1432
/*
 * Hyper-V entry points generated by BUILD_INTERRUPT3, one per
 * (name, vector, C handler) triple below.
 */
1433BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1434		 hyperv_vector_handler)
1435
1436BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
1437		 hyperv_reenlightenment_intr)
1438
1439BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
1440		 hv_stimer0_vector_handler)
1441
1442#endif /* CONFIG_HYPERV */
1443
/*
 * Exception stub: push the address of do_page_fault and enter
 * common_exception_read_cr2, which additionally loads CR2 into %ecx
 * before calling the handler. No error code is pushed here - presumably
 * the CPU has already pushed one for this exception (confirm against the
 * SDM).
 */
1444ENTRY(page_fault)
1445	ASM_CLAC
1446	pushl	$do_page_fault
1447	jmp	common_exception_read_cr2
1448END(page_fault)
1449
/*
 * Variant of common_exception that also reads CR2.
 *
 * On entry the top of the stack holds the handler address pushed by the
 * stub, with the error code in the slot above it. The handler is called
 * with %eax = pt_regs pointer, %edx = error code, %ecx = CR2 value.
 * Returns through ret_from_exception.
 */
1450common_exception_read_cr2:
1451	/* the function address is in %gs's slot on the stack */
1452	SAVE_ALL switch_stacks=1 skip_gs=1
1453
1454	ENCODE_FRAME_POINTER
1455	UNWIND_ESPFIX_STACK
1456
1457	/* fixup %gs */
1458	GS_TO_REG %ecx
1459	movl	PT_GS(%esp), %edi		# get the function address
1460	REG_TO_PTGS %ecx
1461	SET_KERNEL_GS %ecx
1462
1463	GET_CR2_INTO(%ecx)			# might clobber %eax
1464
1465	/* fixup orig %eax */
1466	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
1467	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
1468
1469	TRACE_IRQS_OFF
1470	movl	%esp, %eax			# pt_regs pointer
1471	CALL_NOSPEC %edi
1472	jmp	ret_from_exception
1473END(common_exception_read_cr2)
1474
/*
 * Shared body of the exception stubs.
 *
 * On entry the top of the stack holds the handler address pushed by the
 * stub, with the error code (or the stub's placeholder) in the slot above
 * it. After SAVE_ALL these are found in the %gs and orig_eax slots of the
 * frame. The handler is called with %eax = pt_regs pointer and
 * %edx = error code. Returns through ret_from_exception.
 */
1475common_exception:
1476	/* the function address is in %gs's slot on the stack */
1477	SAVE_ALL switch_stacks=1 skip_gs=1
1478	ENCODE_FRAME_POINTER
1479	UNWIND_ESPFIX_STACK
1480
1481	/* fixup %gs */
1482	GS_TO_REG %ecx
1483	movl	PT_GS(%esp), %edi		# get the function address
1484	REG_TO_PTGS %ecx
1485	SET_KERNEL_GS %ecx
1486
1487	/* fixup orig %eax */
1488	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
1489	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
1490
1491	TRACE_IRQS_OFF
1492	movl	%esp, %eax			# pt_regs pointer
1493	CALL_NOSPEC %edi
1494	jmp	ret_from_exception
1495END(common_exception)
1496
/*
 * Exception stub: push -1 in the error-code position and the address of
 * do_debug, then enter common_exception.
 */
1497ENTRY(debug)
1498	/*
1499	 * Entry from sysenter is now handled in common_exception
1500	 */
1501	ASM_CLAC
1502	pushl	$-1				# mark this as an int
1503	pushl	$do_debug
1504	jmp	common_exception
1505END(debug)
1506
1507/*
1508 * NMI is doubly nasty.  It can happen on the first instruction of
1509 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
1510 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
1511 * switched stacks.  We handle both conditions by simply checking whether we
1512 * interrupted kernel code running on the SYSENTER stack.
1513 */
1514ENTRY(nmi)
1515	ASM_CLAC
1516
1517#ifdef CONFIG_X86_ESPFIX32
	/*
	 * Test %ss against __ESPFIX_SS using %eax as scratch. popl does not
	 * modify EFLAGS, so the je below still acts on the cmpw result.
	 */
1518	pushl	%eax
1519	movl	%ss, %eax
1520	cmpw	$__ESPFIX_SS, %ax
1521	popl	%eax
1522	je	.Lnmi_espfix_stack
1523#endif
1524
1525	pushl	%eax				# pt_regs->orig_ax
1526	SAVE_ALL_NMI cr3_reg=%edi
1527	ENCODE_FRAME_POINTER
1528	xorl	%edx, %edx			# zero error code
1529	movl	%esp, %eax			# pt_regs pointer
1530
1531	/* Are we currently on the SYSENTER stack? */
1532	movl	PER_CPU_VAR(cpu_entry_area), %ecx
1533	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
1534	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
	/* unsigned compare: true only when esp lies within the entry stack */
1535	cmpl	$SIZEOF_entry_stack, %ecx
1536	jb	.Lnmi_from_sysenter_stack
1537
1538	/* Not on SYSENTER stack. */
1539	call	do_nmi
1540	jmp	.Lnmi_return
1541
1542.Lnmi_from_sysenter_stack:
1543	/*
1544	 * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
1545	 * is using the thread stack right now, so it's safe for us to use it.
1546	 */
	/*
	 * %ebx carries the old %esp across the call - this relies on do_nmi
	 * preserving %ebx (presumably callee-saved in the C ABI; confirm).
	 */
1547	movl	%esp, %ebx
1548	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esp
1549	call	do_nmi
1550	movl	%ebx, %esp
1551
1552.Lnmi_return:
1553	CHECK_AND_APPLY_ESPFIX
1554	RESTORE_ALL_NMI cr3_reg=%edi pop=4
1555	jmp	.Lirq_return
1556
1557#ifdef CONFIG_X86_ESPFIX32
1558.Lnmi_espfix_stack:
1559	/*
1560	 * create the pointer to lss back
1561	 */
1562	pushl	%ss
1563	pushl	%esp
	/* make the saved %esp point above the %ss word just pushed */
1564	addl	$4, (%esp)
1565	/* copy the iret frame of 12 bytes */
1566	.rept 3
1567	pushl	16(%esp)
1568	.endr
1569	pushl	%eax
1570	SAVE_ALL_NMI cr3_reg=%edi
1571	ENCODE_FRAME_POINTER
1572	FIXUP_ESPFIX_STACK			# %eax == %esp
1573	xorl	%edx, %edx			# zero error code
1574	call	do_nmi
1575	RESTORE_ALL_NMI cr3_reg=%edi
	/*
	 * 12+4: presumably skips the 4-byte orig_ax slot (no pop=4 above) and
	 * the 12-byte iret frame copy to reach the saved %esp/%ss pair.
	 */
1576	lss	12+4(%esp), %esp		# back to espfix stack
1577	jmp	.Lirq_return
1578#endif
1579END(nmi)
1580
/*
 * Entry that does not go through common_exception: saves the register
 * frame itself and calls do_int3 directly with %eax = pt_regs pointer and
 * %edx = 0, then leaves through ret_from_exception.
 */
1581ENTRY(int3)
1582	ASM_CLAC
1583	pushl	$-1				# mark this as an int
1584
1585	SAVE_ALL switch_stacks=1
1586	ENCODE_FRAME_POINTER
1587	TRACE_IRQS_OFF
1588	xorl	%edx, %edx			# zero error code
1589	movl	%esp, %eax			# pt_regs pointer
1590	call	do_int3
1591	jmp	ret_from_exception
1592END(int3)
1593
/*
 * Exception stub: push the address of do_general_protection and enter
 * common_exception. No error code is pushed here - presumably the CPU has
 * already pushed one for this exception (confirm against the SDM).
 *
 * ASM_CLAC must be the first instruction, as in every other exception
 * stub in this file, so the handler never runs with EFLAGS.AC inherited
 * from the interrupted context.
 */
1594ENTRY(general_protection)
	ASM_CLAC
1595	pushl	$do_general_protection
1596	jmp	common_exception
1597END(general_protection)
1598
1599#ifdef CONFIG_KVM_GUEST
/*
 * Exception stub: push the address of do_async_page_fault and enter
 * common_exception_read_cr2, which loads CR2 into %ecx before calling the
 * handler. No error code is pushed here - presumably one is already on
 * the stack (confirm).
 */
1600ENTRY(async_page_fault)
1601	ASM_CLAC
1602	pushl	$do_async_page_fault
1603	jmp	common_exception_read_cr2
1604END(async_page_fault)
1605#endif
1606
/*
 * Reset %esp to just below the top of the current kernel stack
 * (cpu_current_top_of_stack minus TOP_OF_KERNEL_STACK_PADDING and
 * PTREGS_SIZE) and call do_exit. %eax is not touched here, so whatever
 * the caller placed in it is passed through - presumably the exit code
 * (confirm against the C prototype). Spins forever if do_exit returns.
 */
1607ENTRY(rewind_stack_do_exit)
1608	/* Prevent any naive code from trying to unwind to our caller. */
1609	xorl	%ebp, %ebp
1610
1611	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esi
1612	leal	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
1613
1614	call	do_exit
16151:	jmp 1b
1616END(rewind_stack_do_exit)
1617