xref: /openbmc/linux/arch/x86/entry/entry_32.S (revision 1fa0a7dc)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  Copyright (C) 1991,1992  Linus Torvalds
 *
 * entry_32.S contains the system-call and low-level fault and trap handling routines.
 *
 * Stack layout while running C code (offsets are hexadecimal):
 *	ptrace needs to have all registers on the stack.
 *	If the order here is changed, it needs to be
 *	updated in fork.c:copy_process(), signal.c:do_signal(),
 *	ptrace.c and ptrace.h
 *
 *	 0(%esp) - %ebx
 *	 4(%esp) - %ecx
 *	 8(%esp) - %edx
 *	 C(%esp) - %esi
 *	10(%esp) - %edi
 *	14(%esp) - %ebp
 *	18(%esp) - %eax
 *	1C(%esp) - %ds
 *	20(%esp) - %es
 *	24(%esp) - %fs
 *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
 *	2C(%esp) - orig_eax
 *	30(%esp) - %eip
 *	34(%esp) - %cs
 *	38(%esp) - %eflags
 *	3C(%esp) - %oldesp
 *	40(%esp) - %oldss
 */

#include <linux/linkage.h>
#include <linux/err.h>
#include <asm/thread_info.h>
#include <asm/irqflags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#include "calling.h"

51	.section .entry.text, "ax"
52
53/*
54 * We use macros for low-level operations which need to be overridden
55 * for paravirtualization.  The following will never clobber any registers:
56 *   INTERRUPT_RETURN (aka. "iret")
57 *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
58 *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
59 *
60 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
61 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
62 * Allowing a register to be clobbered can shrink the paravirt replacement
63 * enough to patch inline, increasing performance.
64 */
65
66#ifdef CONFIG_PREEMPT
67# define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
68#else
69# define preempt_stop(clobbers)
70# define resume_kernel		restore_all_kernel
71#endif
72
/*
 * TRACE_IRQS_IRET - irq-flags tracing hook for the iret path.
 *
 * With CONFIG_TRACE_IRQFLAGS, invokes TRACE_IRQS_ON when the EFLAGS image
 * saved in pt_regs has IF set, i.e. when the upcoming iret will re-enable
 * interrupts.  Expands to nothing otherwise.  Uses local label 1.
 */
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
	jz	1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * CR3 bit that the macros below set to select the user page-table and clear
 * to select the kernel page-table.
 */
#define PTI_SWITCH_MASK         (1 << PAGE_SHIFT)

/*
 * User gs save/restore
 *
 * %gs is used for userland TLS and kernel only uses it for stack
 * canary which is required to be at %gs:20 by gcc.  Read the comment
 * at the top of stackprotector.h for more info.
 *
 * Local labels 98 and 99 are used.
 *
 * Both configurations provide the same set of macros:
 *   PUSH_GS            push the %gs slot of pt_regs (a dummy 0 when lazy)
 *   POP_GS pop=N       pop the %gs slot, then drop N more bytes of stack
 *   POP_GS_EX          fixup + exception-table entry for the pop at label 98
 *   PTGS_TO_GS         load %gs from PT_GS(%esp)
 *   PTGS_TO_GS_EX      fixup + exception-table entry for the load at label 98
 *   GS_TO_REG reg      copy %gs into \reg
 *   REG_TO_PTGS reg    store \reg into PT_GS(%esp)
 *   SET_KERNEL_GS reg  load %gs with __KERNEL_STACK_CANARY (clobbers \reg)
 *
 * The *_EX macros refer to "98b", so they must follow the matching
 * POP_GS/PTGS_TO_GS with no other label 98 in between.
 */
#ifdef CONFIG_X86_32_LAZY_GS

 /* unfortunately push/pop can't be no-op */
.macro PUSH_GS
	pushl	$0
.endm
.macro POP_GS pop=0
	addl	$(4 + \pop), %esp
.endm
.macro POP_GS_EX
.endm

 /* all the rest are no-op */
.macro PTGS_TO_GS
.endm
.macro PTGS_TO_GS_EX
.endm
.macro GS_TO_REG reg
.endm
.macro REG_TO_PTGS reg
.endm
.macro SET_KERNEL_GS reg
.endm

#else	/* CONFIG_X86_32_LAZY_GS */

.macro PUSH_GS
	pushl	%gs
.endm

.macro POP_GS pop=0
98:	popl	%gs
  .if \pop <> 0
	add	$\pop, %esp
  .endif
.endm
.macro POP_GS_EX
.pushsection .fixup, "ax"
	/* The pop faulted: zero the saved %gs slot and retry the pop */
99:	movl	$0, (%esp)
	jmp	98b
.popsection
	_ASM_EXTABLE(98b, 99b)
.endm

.macro PTGS_TO_GS
98:	mov	PT_GS(%esp), %gs
.endm
.macro PTGS_TO_GS_EX
.pushsection .fixup, "ax"
	/* The load faulted: zero the saved %gs slot and retry the load */
99:	movl	$0, PT_GS(%esp)
	jmp	98b
.popsection
	_ASM_EXTABLE(98b, 99b)
.endm

.macro GS_TO_REG reg
	movl	%gs, \reg
.endm
.macro REG_TO_PTGS reg
	movl	\reg, PT_GS(%esp)
.endm
.macro SET_KERNEL_GS reg
	movl	$(__KERNEL_STACK_CANARY), \reg
	movl	\reg, %gs
.endm

#endif /* CONFIG_X86_32_LAZY_GS */

/*
 * Unconditionally switch to user cr3
 *
 * \scratch_reg: general-purpose register used to read-modify-write %cr3;
 *               clobbered (as are the flags).
 *
 * Without X86_FEATURE_PTI the ALTERNATIVE keeps the jmp and the whole body
 * is skipped.
 */
.macro SWITCH_TO_USER_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI

	movl	%cr3, \scratch_reg
	orl	$PTI_SWITCH_MASK, \scratch_reg
	movl	\scratch_reg, %cr3
.Lend_\@:
.endm

/*
 * BUG_IF_WRONG_CR3 - CONFIG_DEBUG_ENTRY sanity check for the active cr3.
 *
 * Executes ud2 when PTI_SWITCH_MASK is clear in %cr3 (kernel cr3 loaded).
 *
 * \no_user_check: 0 (default) - only check when the RPL bits of PT_CS(%esp)
 *                 are non-zero, i.e. the frame at %esp came from user mode;
 *                 non-zero - check unconditionally (for call sites where no
 *                 pt_regs frame is at %esp).
 *
 * Clobbers %eax and the flags when the check runs.  Expands to nothing
 * without CONFIG_DEBUG_ENTRY and is skipped without X86_FEATURE_PTI.
 */
.macro BUG_IF_WRONG_CR3 no_user_check=0
#ifdef CONFIG_DEBUG_ENTRY
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	.if \no_user_check == 0
	/* coming from usermode? */
	testl	$SEGMENT_RPL_MASK, PT_CS(%esp)
	jz	.Lend_\@
	.endif
	/* On user-cr3? */
	movl	%cr3, %eax
	testl	$PTI_SWITCH_MASK, %eax
	jnz	.Lend_\@
	/* From userspace with kernel cr3 - BUG */
	ud2
.Lend_\@:
#endif
.endm

/*
 * Switch to kernel cr3 if not already loaded and return current cr3 in
 * \scratch_reg
 *
 * "Current" means the value %cr3 held on entry to the macro: if the user
 * cr3 was loaded, PTI_SWITCH_MASK is set again in \scratch_reg after the
 * switch so callers can tell which page-table they came in on.
 *
 * Clobbers \scratch_reg and the flags.  Skipped entirely (and \scratch_reg
 * left untouched) without X86_FEATURE_PTI.
 */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
	movl	%cr3, \scratch_reg
	/* Test if we are already on kernel CR3 */
	testl	$PTI_SWITCH_MASK, \scratch_reg
	jz	.Lend_\@
	andl	$(~PTI_SWITCH_MASK), \scratch_reg
	movl	\scratch_reg, %cr3
	/* Return original CR3 in \scratch_reg */
	orl	$PTI_SWITCH_MASK, \scratch_reg
.Lend_\@:
.endm

/*
 * SAVE_ALL - push the remaining pt_regs fields and load kernel segments.
 *
 * Pushes, in order: gs slot, %fs, %es, %ds, \pt_regs_ax, %ebp, %edi, %esi,
 * %edx, %ecx, %ebx (matching the layout at the top of this file), then
 * loads %ds/%es with __USER_DS, %fs with __KERNEL_PERCPU and sets up the
 * kernel %gs.
 *
 * \pt_regs_ax:    operand pushed into the %eax slot (default: %eax).
 * \switch_stacks: when > 0, run SWITCH_TO_KERNEL_STACK afterwards.
 *
 * Clobbers %edx (after it has been saved); SWITCH_TO_KERNEL_STACK clobbers
 * further registers.  Also clears the direction flag.
 */
.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
	cld
	PUSH_GS
	pushl	%fs
	pushl	%es
	pushl	%ds
	pushl	\pt_regs_ax
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%edx
	pushl	%ecx
	pushl	%ebx
	movl	$(__USER_DS), %edx
	movl	%edx, %ds
	movl	%edx, %es
	movl	$(__KERNEL_PERCPU), %edx
	movl	%edx, %fs
	SET_KERNEL_GS %edx

	/* Switch to kernel stack if necessary */
.if \switch_stacks > 0
	SWITCH_TO_KERNEL_STACK
.endif

.endm

/*
 * SAVE_ALL_NMI - SAVE_ALL (without a stack switch) followed by a switch to
 * the kernel cr3.
 *
 * \cr3_reg: register that receives the cr3 value found on entry; keep it
 *           live and hand it to RESTORE_ALL_NMI on the way out.
 */
.macro SAVE_ALL_NMI cr3_reg:req
	SAVE_ALL

	BUG_IF_WRONG_CR3

	/*
	 * Now switch the CR3 when PTI is enabled.
	 *
	 * We can enter with either user or kernel cr3, the code will
	 * store the old cr3 in \cr3_reg and switches to the kernel cr3
	 * if necessary.
	 */
	SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg

.endm

/*
 * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
 * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
 * is just clearing the MSB, which makes it an invalid stack address and is also
 * a signal to the unwinder that it's a pt_regs pointer in disguise.
 *
 * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
 * original %ebp.
 *
 * Expands to nothing without CONFIG_FRAME_POINTER.
 */
.macro ENCODE_FRAME_POINTER
#ifdef CONFIG_FRAME_POINTER
	mov %esp, %ebp
	andl $0x7fffffff, %ebp
#endif
.endm

/*
 * RESTORE_INT_REGS - pop the seven integer registers from the bottom of
 * pt_regs, in the reverse of the order SAVE_ALL pushed them.
 */
.macro RESTORE_INT_REGS
	popl	%ebx
	popl	%ecx
	popl	%edx
	popl	%esi
	popl	%edi
	popl	%ebp
	popl	%eax
.endm

/*
 * RESTORE_REGS - pop the integer registers and the %ds/%es/%fs/%gs slots.
 *
 * \pop: extra bytes to drop from the stack after the %gs slot (callers pass
 *       4 to skip orig_eax/error_code).
 *
 * Each segment pop has an exception-table entry: if loading the selector
 * faults, the fixup writes 0 over the saved selector and retries the pop.
 * Uses local labels 1-6 (plus 98/99 via POP_GS/POP_GS_EX).
 */
.macro RESTORE_REGS pop=0
	RESTORE_INT_REGS
1:	popl	%ds
2:	popl	%es
3:	popl	%fs
	POP_GS \pop
.pushsection .fixup, "ax"
4:	movl	$0, (%esp)
	jmp	1b
5:	movl	$0, (%esp)
	jmp	2b
6:	movl	$0, (%esp)
	jmp	3b
.popsection
	_ASM_EXTABLE(1b, 4b)
	_ASM_EXTABLE(2b, 5b)
	_ASM_EXTABLE(3b, 6b)
	POP_GS_EX
.endm

/*
 * RESTORE_ALL_NMI - counterpart of SAVE_ALL_NMI.
 *
 * \cr3_reg: register holding the cr3 value saved by SAVE_ALL_NMI.
 * \pop:     forwarded to RESTORE_REGS.
 */
.macro RESTORE_ALL_NMI cr3_reg:req pop=0
	/*
	 * Now switch the CR3 when PTI is enabled.
	 *
	 * We enter with kernel cr3 and switch the cr3 to the value
	 * stored on \cr3_reg, which is either a user or a kernel cr3.
	 */
	ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI

	testl	$PTI_SWITCH_MASK, \cr3_reg
	jz	.Lswitched_\@

	/* User cr3 in \cr3_reg - write it to hardware cr3 */
	movl	\cr3_reg, %cr3

.Lswitched_\@:

	BUG_IF_WRONG_CR3

	RESTORE_REGS pop=\pop
.endm

/*
 * CHECK_AND_APPLY_ESPFIX - switch to the ESPFIX stack segment when the
 * pt_regs frame at %esp returns to user mode with an LDT-based SS.
 *
 * Clobbers %eax, %edx and the flags; interrupts are disabled when the
 * switch is performed.  Expands to nothing without CONFIG_X86_ESPFIX32 and
 * is skipped on CPUs without X86_BUG_ESPFIX.
 */
.macro CHECK_AND_APPLY_ESPFIX
#ifdef CONFIG_X86_ESPFIX32
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)

	ALTERNATIVE	"jmp .Lend_\@", "", X86_BUG_ESPFIX

	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
	/*
	 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
	 * are returning to the kernel.
	 * See comments in process.c:copy_thread() for details.
	 */
	movb	PT_OLDSS(%esp), %ah
	movb	PT_CS(%esp), %al
	andl	$(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
	cmpl	$((SEGMENT_LDT << 8) | USER_RPL), %eax
	jne	.Lend_\@	# done unless returning to user-space with LDT SS

	/*
	 * Setup and switch to ESPFIX stack
	 *
	 * We're returning to userspace with a 16 bit stack. The CPU will not
	 * restore the high word of ESP for us on executing iret... This is an
	 * "official" bug of all the x86-compatible CPUs, which we can work
	 * around to make dosemu and wine happy. We do this by preloading the
	 * high word of ESP with the high word of the userspace ESP while
	 * compensating for the offset by changing to the ESPFIX segment with
	 * a base address that matches for the difference.
	 */
	mov	%esp, %edx			/* load kernel esp */
	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
	mov	%dx, %ax			/* eax: new kernel esp */
	sub	%eax, %edx			/* offset (low word is 0) */
	shr	$16, %edx
	mov	%dl, GDT_ESPFIX_SS + 4		/* bits 16..23 */
	mov	%dh, GDT_ESPFIX_SS + 7		/* bits 24..31 */
	pushl	$__ESPFIX_SS
	pushl	%eax				/* new kernel esp */
	/*
	 * Disable interrupts, but do not irqtrace this section: we
	 * will soon execute iret and the tracer was already set to
	 * the irqstate after the IRET:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	lss	(%esp), %esp			/* switch to espfix segment */
.Lend_\@:
#endif /* CONFIG_X86_ESPFIX32 */
.endm

/*
 * Called with pt_regs fully populated and kernel segments loaded,
 * so we can access PER_CPU and use the integer registers.
 *
 * We need to be very careful here with the %esp switch, because an NMI
 * can happen everywhere. If the NMI handler finds itself on the
 * entry-stack, it will overwrite the task-stack and everything we
 * copied there. So allocate the stack-frame on the task-stack and
 * switch to it before we do any copying.
 *
 * Clobbers %eax, %ecx, %esi, %edi and the flags.  Skipped on Xen PV.
 */

/* Markers kept in the otherwise unused high 16 bits of the saved CS dword */
#define CS_FROM_ENTRY_STACK	(1 << 31)
#define CS_FROM_USER_CR3	(1 << 30)

.macro SWITCH_TO_KERNEL_STACK

	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV

	BUG_IF_WRONG_CR3

	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax

	/*
	 * %eax now contains the entry cr3 and we carry it forward in
	 * that register for the time this macro runs
	 */

	/*
	 * The high bits of the CS dword (__csh) are used for
	 * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
	 * hardware didn't do this for us.
	 */
	andl	$(0x0000ffff), PT_CS(%esp)

	/* Are we on the entry stack? Bail out if not! */
	movl	PER_CPU_VAR(cpu_entry_area), %ecx
	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
	subl	%esp, %ecx	/* ecx = (end of entry_stack) - esp */
	cmpl	$SIZEOF_entry_stack, %ecx
	jae	.Lend_\@

	/* Load stack pointer into %esi and %edi */
	movl	%esp, %esi
	movl	%esi, %edi

	/* Move %edi to the top of the entry stack */
	andl	$(MASK_entry_stack), %edi
	addl	$(SIZEOF_entry_stack), %edi

	/* Load top of task-stack into %edi */
	movl	TSS_entry2task_stack(%edi), %edi

	/* Special case - entry from kernel mode via entry stack */
#ifdef CONFIG_VM86
	movl	PT_EFLAGS(%esp), %ecx		# mix EFLAGS and CS
	movb	PT_CS(%esp), %cl
	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
#else
	movl	PT_CS(%esp), %ecx
	andl	$SEGMENT_RPL_MASK, %ecx
#endif
	cmpl	$USER_RPL, %ecx
	jb	.Lentry_from_kernel_\@

	/* Bytes to copy */
	movl	$PTREGS_SIZE, %ecx

#ifdef CONFIG_VM86
	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esi)
	jz	.Lcopy_pt_regs_\@

	/*
	 * Stack-frame contains 4 additional segment registers when
	 * coming from VM86 mode
	 */
	addl	$(4 * 4), %ecx

#endif
.Lcopy_pt_regs_\@:

	/* Allocate frame on task-stack */
	subl	%ecx, %edi

	/* Switch to task-stack */
	movl	%edi, %esp

	/*
	 * We are now on the task-stack and can safely copy over the
	 * stack-frame
	 */
	shrl	$2, %ecx
	cld
	rep movsl

	jmp .Lend_\@

.Lentry_from_kernel_\@:

	/*
	 * This handles the case when we enter the kernel from
	 * kernel-mode and %esp points to the entry-stack. When this
	 * happens we need to switch to the task-stack to run C code,
	 * but switch back to the entry-stack again when we approach
	 * iret and return to the interrupted code-path. This usually
	 * happens when we hit an exception while restoring user-space
	 * segment registers on the way back to user-space or when the
	 * sysenter handler runs with eflags.tf set.
	 *
	 * When we switch to the task-stack here, we can't trust the
	 * contents of the entry-stack anymore, as the exception handler
	 * might be scheduled out or moved to another CPU. Therefore we
	 * copy the complete entry-stack to the task-stack and set a
	 * marker in the iret-frame (bit 31 of the CS dword) to detect
	 * what we've done on the iret path.
	 *
	 * On the iret path we copy everything back and switch to the
	 * entry-stack, so that the interrupted kernel code-path
	 * continues on the same stack it was interrupted with.
	 *
	 * Be aware that an NMI can happen anytime in this code.
	 *
	 * %esi: Entry-Stack pointer (same as %esp)
	 * %edi: Top of the task stack
	 * %eax: CR3 on kernel entry
	 */

	/* Calculate number of bytes on the entry stack in %ecx */
	movl	%esi, %ecx

	/* %ecx to the top of entry-stack */
	andl	$(MASK_entry_stack), %ecx
	addl	$(SIZEOF_entry_stack), %ecx

	/* Number of bytes on the entry stack to %ecx */
	sub	%esi, %ecx

	/* Mark stackframe as coming from entry stack */
	orl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)

	/*
	 * Test the cr3 used to enter the kernel and add a marker
	 * so that we can switch back to it before iret.
	 */
	testl	$PTI_SWITCH_MASK, %eax
	jz	.Lcopy_pt_regs_\@
	orl	$CS_FROM_USER_CR3, PT_CS(%esp)

	/*
	 * %esi and %edi are unchanged, %ecx contains the number of
	 * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
	 * the stack-frame on task-stack and copy everything over
	 */
	jmp .Lcopy_pt_regs_\@

.Lend_\@:
.endm

/*
 * Switch back from the kernel stack to the entry stack.
 *
 * The %esp register must point to pt_regs on the task stack. It will
 * first calculate the size of the stack-frame to copy, depending on
 * whether we return to VM86 mode or not. With that it uses 'rep movsl'
 * to copy the contents of the stack over to the entry stack.
 *
 * We must be very careful here, as we can't trust the contents of the
 * task-stack once we switched to the entry-stack. When an NMI happens
 * while on the entry-stack, the NMI handler will switch back to the top
 * of the task stack, overwriting our stack-frame we are about to copy.
 * Therefore we switch the stack only after everything is copied over.
 *
 * Clobbers %ebx, %ecx, %esi, %edi and the flags (all of which are reloaded
 * from the copied pt_regs afterwards).  Skipped on Xen PV.
 */
.macro SWITCH_TO_ENTRY_STACK

	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV

	/* Bytes to copy */
	movl	$PTREGS_SIZE, %ecx

#ifdef CONFIG_VM86
	testl	$(X86_EFLAGS_VM), PT_EFLAGS(%esp)
	jz	.Lcopy_pt_regs_\@

	/* Additional 4 registers to copy when returning to VM86 mode */
	addl    $(4 * 4), %ecx

.Lcopy_pt_regs_\@:
#endif

	/* Initialize source and destination for movsl */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
	subl	%ecx, %edi
	movl	%esp, %esi

	/* Save future stack pointer in %ebx */
	movl	%edi, %ebx

	/* Copy over the stack-frame */
	shrl	$2, %ecx
	cld
	rep movsl

	/*
	 * Switch to entry-stack - needs to happen after everything is
	 * copied because the NMI handler will overwrite the task-stack
	 * when on entry-stack
	 */
	movl	%ebx, %esp

.Lend_\@:
.endm

/*
 * This macro handles the case when we return to kernel-mode on the iret
 * path and have to switch back to the entry stack and/or user-cr3
 *
 * See the comments below the .Lentry_from_kernel_\@ label in the
 * SWITCH_TO_KERNEL_STACK macro for more details.
 *
 * On the slow path this clobbers %ebx, %ecx, %esi, %edi (and %eax when the
 * user cr3 is restored); the fast path only touches the flags.
 */
.macro PARANOID_EXIT_TO_KERNEL_MODE

	/*
	 * Test if we entered the kernel with the entry-stack. Most
	 * likely we did not, because this code only runs on the
	 * return-to-kernel path.
	 */
	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
	jz	.Lend_\@

	/* Unlikely slow-path */

	/* Clear marker from stack-frame */
	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)

	/* Copy the remaining task-stack contents to entry-stack */
	movl	%esp, %esi
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi

	/* Bytes on the task-stack to ecx */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
	subl	%esi, %ecx

	/* Allocate stack-frame on entry-stack */
	subl	%ecx, %edi

	/*
	 * Save future stack-pointer, we must not switch until the
	 * copy is done, otherwise the NMI handler could destroy the
	 * contents of the task-stack we are about to copy.
	 */
	movl	%edi, %ebx

	/* Do the copy */
	shrl	$2, %ecx
	cld
	rep movsl

	/* Safe to switch to entry-stack now */
	movl	%ebx, %esp

	/*
	 * We came from entry-stack and need to check if we also need to
	 * switch back to user cr3.
	 */
	testl	$CS_FROM_USER_CR3, PT_CS(%esp)
	jz	.Lend_\@

	/* Clear marker from stack-frame */
	andl	$(~CS_FROM_USER_CR3), PT_CS(%esp)

	SWITCH_TO_USER_CR3 scratch_reg=%eax

.Lend_\@:
.endm
/*
 * __switch_to_asm - low-level half of the context switch.
 *
 * %eax: prev task
 * %edx: next task
 *
 * Saves the callee-saved registers and EFLAGS on prev's stack, swaps %esp
 * via TASK_threadsp, restores the same set from next's stack and tail-jumps
 * to __switch_to with %eax/%edx still holding prev/next.
 */
ENTRY(__switch_to_asm)
	/*
	 * Save callee-saved registers
	 * This must match the order in struct inactive_task_frame
	 */
	pushl	%ebp
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	pushfl

	/* switch stack */
	movl	%esp, TASK_threadsp(%eax)
	movl	TASK_threadsp(%edx), %esp

#ifdef CONFIG_STACKPROTECTOR
	/* %ebx is free as scratch here: it is reloaded from the stack below */
	movl	TASK_stack_canary(%edx), %ebx
	movl	%ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
#endif

#ifdef CONFIG_RETPOLINE
	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
#endif

	/* restore callee-saved registers */
	popfl
	popl	%esi
	popl	%edi
	popl	%ebx
	popl	%ebp

	jmp	__switch_to
END(__switch_to_asm)

/*
 * The unwinder expects the last frame on the stack to always be at the same
 * offset from the end of the page, which allows it to validate the stack.
 * Calling schedule_tail() directly would break that convention because it's an
 * asmlinkage function so its argument has to be pushed on the stack.  This
 * wrapper creates a proper "end of stack" frame header before the call.
 *
 * In:  %eax = argument passed on to schedule_tail()
 * Out: %eax holds whatever is in the argument slot after the call
 */
ENTRY(schedule_tail_wrapper)
	FRAME_BEGIN

	pushl	%eax
	call	schedule_tail
	popl	%eax

	FRAME_END
	ret
ENDPROC(schedule_tail_wrapper)
/*
 * A newly forked process directly context switches into this address.
 *
 * eax: prev task we switched from
 * ebx: kernel thread func (NULL for user thread)
 * edi: kernel thread arg
 *
 * Uses local labels 1 (kernel-thread path) and 2 (common exit to user mode).
 */
ENTRY(ret_from_fork)
	call	schedule_tail_wrapper

	testl	%ebx, %ebx
	jnz	1f		/* kernel threads are uncommon */

2:
	/* When we fork, we trace the syscall return in the child, too. */
	movl    %esp, %eax
	call    syscall_return_slowpath
	STACKLEAK_ERASE
	jmp     restore_all

	/* kernel thread */
1:	movl	%edi, %eax
	CALL_NOSPEC %ebx
	/*
	 * A kernel thread is allowed to return here after successfully
	 * calling do_execve().  Exit to userspace to complete the execve()
	 * syscall.
	 */
	movl	$0, PT_EAX(%esp)
	jmp	2b
END(ret_from_fork)

/*
 * Return to user mode is not as complex as all this looks,
 * but we want the default path for a system call return to
 * go as quickly as possible which is why some of this is
 * less clear than it otherwise should be.
 *
 * ret_from_exception falls through into ret_from_intr, which dispatches on
 * the saved CS/EFLAGS: kernel-mode frames go to resume_kernel, user-mode and
 * vm86 frames fall through into resume_userspace.
 */

	# userspace resumption stub bypassing syscall exit tracing
	ALIGN
ret_from_exception:
	preempt_stop(CLBR_ANY)
ret_from_intr:
#ifdef CONFIG_VM86
	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
	movb	PT_CS(%esp), %al
	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
#else
	/*
	 * We can be coming here from child spawned by kernel_thread().
	 */
	movl	PT_CS(%esp), %eax
	andl	$SEGMENT_RPL_MASK, %eax
#endif
	cmpl	$USER_RPL, %eax
	jb	resume_kernel			# not returning to v8086 or userspace

ENTRY(resume_userspace)
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF
	movl	%esp, %eax
	call	prepare_exit_to_usermode
	jmp	restore_all
END(ret_from_exception)

#ifdef CONFIG_PREEMPT
/*
 * resume_kernel - return to kernel mode on a preemptible kernel.
 *
 * Calls preempt_schedule_irq() only when the per-cpu __preempt_count is
 * zero and the interrupted context had interrupts enabled; in every case
 * control ends up at restore_all_kernel.
 */
ENTRY(resume_kernel)
	DISABLE_INTERRUPTS(CLBR_ANY)
	cmpl	$0, PER_CPU_VAR(__preempt_count)
	jnz	restore_all_kernel
	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
	jz	restore_all_kernel
	call	preempt_schedule_irq
	jmp	restore_all_kernel
END(resume_kernel)
#endif

GLOBAL(__begin_SYSENTER_singlestep_region)
/*
 * All code from here through __end_SYSENTER_singlestep_region is subject
 * to being single-stepped if a user program sets TF and executes SYSENTER.
 * There is absolutely nothing that we can do to prevent this from happening
 * (thanks Intel!).  To keep our handling of this situation as simple as
 * possible, we handle TF just like AC and NT, except that our #DB handler
 * will ignore all of the single-step traps generated in this range.
 */

#ifdef CONFIG_XEN_PV
/*
 * Xen doesn't set %esp to be precisely what the normal SYSENTER
 * entry point expects, so fix it up before using the normal path.
 */
ENTRY(xen_sysenter_target)
	addl	$5*4, %esp			/* remove xen-provided frame */
	jmp	.Lsysenter_past_esp
#endif

/*
 * 32-bit SYSENTER entry.
 *
 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
 * if X86_FEATURE_SEP is available.  This is the preferred system call
 * entry on 32-bit systems.
 *
 * The SYSENTER instruction, in principle, should *only* occur in the
 * vDSO.  In practice, a small number of Android devices were shipped
 * with a copy of Bionic that inlined a SYSENTER instruction.  This
 * never happened in any of Google's Bionic versions -- it only happened
 * in a narrow range of Intel-provided versions.
 *
 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
 * IF and VM in EFLAGS are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old EIP (!!!), ESP, or EFLAGS.
 *
 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
 * user and/or vm86 state), we explicitly disable the SYSENTER
 * instruction in vm86 mode by reprogramming the MSRs.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 */
ENTRY(entry_SYSENTER_32)
	/*
	 * On entry-stack with all userspace-regs live - save and
	 * restore eflags and %eax to use it as scratch-reg for the cr3
	 * switch.
	 */
	pushfl
	pushl	%eax
	BUG_IF_WRONG_CR3 no_user_check=1
	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
	popl	%eax
	popfl

	/* Stack empty again, switch to task stack */
	movl	TSS_entry2task_stack(%esp), %esp

.Lsysenter_past_esp:
	pushl	$__USER_DS		/* pt_regs->ss */
	pushl	%ebp			/* pt_regs->sp (stashed in bp) */
	pushfl				/* pt_regs->flags (except IF = 0) */
	orl	$X86_EFLAGS_IF, (%esp)	/* Fix IF */
	pushl	$__USER_CS		/* pt_regs->cs */
	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
	pushl	%eax			/* pt_regs->orig_ax */
	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest, stack already switched */

	/*
	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
	 * and TF ourselves.  To save a few cycles, we can check whether
	 * any of them was set instead of doing an unconditional popfl.
	 * This needs to happen before enabling interrupts so that
	 * we don't get preempted with NT set.
	 *
	 * If TF is set, we will single-step all the way to here -- do_debug
	 * will ignore all the traps.  (Yes, this is slow, but so is
	 * single-stepping in general.  This allows us to avoid having
	 * a more complicated code to handle the case where a user program
	 * forces us to single-step through the SYSENTER entry code.)
	 *
	 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
	 * out-of-line as an optimization: NT is unlikely to be set in the
	 * majority of the cases and instead of polluting the I$ unnecessarily,
	 * we're keeping that code behind a branch which will predict as
	 * not-taken and therefore its instructions won't be fetched.
	 */
	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
	jnz	.Lsysenter_fix_flags
.Lsysenter_flags_fixed:

	/*
	 * User mode is traced as though IRQs are on, and SYSENTER
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_fast_syscall_32
	/* XEN PV guests always use IRET path */
	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV

	STACKLEAK_ERASE

/* Opportunistic SYSEXIT */
	TRACE_IRQS_ON			/* User mode traces as IRQs on. */

	/*
	 * Setup entry stack - we keep the pointer in %eax and do the
	 * switch after almost all user-state is restored.
	 */

	/* Load entry stack pointer and allocate frame for eflags/eax */
	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
	subl	$(2*4), %eax

	/* Copy eflags and eax to entry stack */
	movl	PT_EFLAGS(%esp), %edi
	movl	PT_EAX(%esp), %esi
	movl	%edi, (%eax)
	movl	%esi, 4(%eax)

	/* Restore user registers and segments */
	movl	PT_EIP(%esp), %edx	/* pt_regs->ip */
	movl	PT_OLDESP(%esp), %ecx	/* pt_regs->sp */
1:	mov	PT_FS(%esp), %fs
	PTGS_TO_GS

	popl	%ebx			/* pt_regs->bx */
	addl	$2*4, %esp		/* skip pt_regs->cx and pt_regs->dx */
	popl	%esi			/* pt_regs->si */
	popl	%edi			/* pt_regs->di */
	popl	%ebp			/* pt_regs->bp */

	/* Switch to entry stack */
	movl	%eax, %esp

	/* Now ready to switch the cr3 */
	SWITCH_TO_USER_CR3 scratch_reg=%eax

	/*
	 * Restore all flags except IF. (We restore IF separately because
	 * STI gives a one-instruction window in which we won't be interrupted,
	 * whereas POPF does not.)
	 */
	btrl	$X86_EFLAGS_IF_BIT, (%esp)
	BUG_IF_WRONG_CR3 no_user_check=1
	popfl
	popl	%eax

	/*
	 * Return back to the vDSO, which will pop ecx and edx.
	 * Don't bother with DS and ES (they already contain __USER_DS).
	 */
	sti
	sysexit

	/* Loading %fs at label 1 faulted: zero the saved selector and retry */
.pushsection .fixup, "ax"
2:	movl	$0, PT_FS(%esp)
	jmp	1b
.popsection
	_ASM_EXTABLE(1b, 2b)
	PTGS_TO_GS_EX

.Lsysenter_fix_flags:
	pushl	$X86_EFLAGS_FIXED
	popfl
	jmp	.Lsysenter_flags_fixed
GLOBAL(__end_SYSENTER_singlestep_region)
ENDPROC(entry_SYSENTER_32)

/*
 * 32-bit legacy system call entry.
 *
 * 32-bit x86 Linux system calls traditionally used the INT $0x80
 * instruction.  INT $0x80 lands here.
 *
 * This entry point can be used by any 32-bit program to perform system
 * calls.  Instances of INT $0x80 can be found inline in various programs
 * and libraries.  It is also used by the vDSO's __kernel_vsyscall
 * fallback for hardware that doesn't support a faster entry method.
 * Restarted 32-bit system calls also fall back to INT $0x80
 * regardless of what instruction was originally used to do the system
 * call.  (64-bit programs can use INT $0x80 as well, but they can
 * only run on 64-bit kernels and therefore land in
 * entry_INT80_compat.)
 *
 * This is considered a slow path.  It is not used by most libc
 * implementations on modern hardware except during process startup.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6
 */
ENTRY(entry_INT80_32)
	ASM_CLAC
	pushl	%eax			/* pt_regs->orig_ax */

	SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1	/* save rest */

	/*
	 * User mode is traced as though IRQs are on, and the interrupt gate
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movl	%esp, %eax
	call	do_int80_syscall_32
.Lsyscall_32_done:

	STACKLEAK_ERASE

/* Common return-to-user path; also reached by jumps from other entry code */
restore_all:
	TRACE_IRQS_IRET
	SWITCH_TO_ENTRY_STACK
.Lrestore_all_notrace:
	CHECK_AND_APPLY_ESPFIX
.Lrestore_nocheck:
	/* Switch back to user CR3 */
	SWITCH_TO_USER_CR3 scratch_reg=%eax

	BUG_IF_WRONG_CR3

	/* Restore user state */
	RESTORE_REGS pop=4			# skip orig_eax/error_code
.Lirq_return:
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
	 * when returning from IPI handler and when returning from
	 * scheduler to user-space.
	 */
	INTERRUPT_RETURN

/* Return-to-kernel path: no stack or cr3 switch unless the markers say so */
restore_all_kernel:
	TRACE_IRQS_IRET
	PARANOID_EXIT_TO_KERNEL_MODE
	BUG_IF_WRONG_CR3
	RESTORE_REGS 4
	jmp	.Lirq_return

/* Fixup target for a faulting iret at .Lirq_return */
.section .fixup, "ax"
ENTRY(iret_exc)
	pushl	$0				# no error code
	pushl	$do_iret_error

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * The stack-frame here is the one that iret faulted on, so it's a
	 * return-to-user frame. We are on kernel-cr3 because we come here from
	 * the fixup code. This confuses the CR3 checker, so switch to user-cr3
	 * as the checker expects it.
	 */
	pushl	%eax
	SWITCH_TO_USER_CR3 scratch_reg=%eax
	popl	%eax
#endif

	jmp	common_exception
.previous
	_ASM_EXTABLE(.Lirq_return, iret_exc)
ENDPROC(entry_INT80_32)

.macro FIXUP_ESPFIX_STACK
/*
 * Switch back from the ESPFIX stack to the normal zero-based stack
 *
 * We can't call C functions using the ESPFIX stack. This code reads
 * the high word of the segment base from the GDT and switches to the
 * normal stack and adjusts ESP with the matching offset.
 *
 * Clobbers %eax and the flags.  Expands to nothing without
 * CONFIG_X86_ESPFIX32.
 */
#ifdef CONFIG_X86_ESPFIX32
	/* fixup the stack */
	mov	GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
	mov	GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
	shl	$16, %eax
	addl	%esp, %eax			/* the adjusted stack pointer */
	pushl	$__KERNEL_DS
	pushl	%eax
	lss	(%esp), %esp			/* switch to the normal stack segment */
#endif
.endm
/*
 * UNWIND_ESPFIX_STACK - if %ss is the ESPFIX segment, load %ds/%es with
 * __KERNEL_DS and switch back to the normal stack via FIXUP_ESPFIX_STACK.
 *
 * Clobbers %eax and the flags.  Uses local label 27.  Expands to nothing
 * without CONFIG_X86_ESPFIX32.
 */
.macro UNWIND_ESPFIX_STACK
#ifdef CONFIG_X86_ESPFIX32
	movl	%ss, %eax
	/* see if on espfix stack */
	cmpw	$__ESPFIX_SS, %ax
	jne	27f
	movl	$__KERNEL_DS, %eax
	movl	%eax, %ds
	movl	%eax, %es
	/* switch to normal stack */
	FIXUP_ESPFIX_STACK
27:
#endif
.endm

/*
 * Build the entry stubs with some assembler magic.
 * We pack 1 stub into every 8-byte block.
 *
 * One stub is emitted per vector in
 * [FIRST_EXTERNAL_VECTOR, FIRST_SYSTEM_VECTOR).  Each pushes
 * (~vector + 0x80) and jumps to common_interrupt, which subtracts the 0x80
 * bias again.
 */
	.align 8
ENTRY(irq_entries_start)
    vector=FIRST_EXTERNAL_VECTOR
    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
    vector=vector+1
	jmp	common_interrupt
	.align	8
    .endr
END(irq_entries_start)

/*
 * the CPU automatically disables interrupts when executing an IRQ vector,
 * so IRQ-flags tracing has to follow that:
 *
 * On entry the top of stack holds the biased vector pushed by the stub in
 * irq_entries_start; after the adjustment it occupies the orig_eax slot of
 * the pt_regs handed to do_IRQ() in %eax.
 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	ASM_CLAC
	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */

	SAVE_ALL switch_stacks=1
	ENCODE_FRAME_POINTER
	TRACE_IRQS_OFF
	movl	%esp, %eax
	call	do_IRQ
	jmp	ret_from_intr
ENDPROC(common_interrupt)

/*
 * BUILD_INTERRUPT3(name, nr, fn): emit an interrupt entry point "name"
 * for vector "nr" that calls the C handler "fn".
 *
 * The stub pushes ~(nr) into the orig_eax slot (the same encoding that
 * common_interrupt ends up with after removing its 0x80 bias), saves the
 * registers, passes the frame pointer in %eax to fn and leaves through
 * ret_from_intr.
 *
 * NOTE(review): the TRACE_IRQS_OFF line has no trailing ';' unlike its
 * neighbours; this presumably relies on the macro expanding either to
 * nothing or to a statement that carries its own terminator - confirm
 * against its definition before touching it.
 */
1124#define BUILD_INTERRUPT3(name, nr, fn)			\
1125ENTRY(name)						\
1126	ASM_CLAC;					\
1127	pushl	$~(nr);					\
1128	SAVE_ALL switch_stacks=1;			\
1129	ENCODE_FRAME_POINTER;				\
1130	TRACE_IRQS_OFF					\
1131	movl	%esp, %eax;				\
1132	call	fn;					\
1133	jmp	ret_from_intr;				\
1134ENDPROC(name)
1135
/*
 * BUILD_INTERRUPT(name, nr): shorthand for BUILD_INTERRUPT3 where the C
 * handler is named smp_<name> (built by token pasting).
 */
1136#define BUILD_INTERRUPT(name, nr)		\
1137	BUILD_INTERRUPT3(name, nr, smp_##name);	\
1138
1139/* The include is where all of the SMP etc. interrupts come from */
1140#include <asm/entry_arch.h>
1141
/*
 * Entry stub for do_coprocessor_error.
 * Stack contract with common_exception: first push fills the error-code
 * (orig_eax) slot, second push stores the handler address in the %gs slot.
 */
1142ENTRY(coprocessor_error)
1143	ASM_CLAC
1144	pushl	$0				/* placeholder error code */
1145	pushl	$do_coprocessor_error		/* handler for common_exception to call */
1146	jmp	common_exception
1147END(coprocessor_error)
1148
/*
 * Entry stub for do_simd_coprocessor_error.
 * Pushes a zero error code, then the handler address, then joins
 * common_exception. With CONFIG_X86_INVD_BUG the handler push is an
 * ALTERNATIVE keyed on X86_FEATURE_XMM: do_simd_coprocessor_error is used
 * when the feature is present, do_general_protection otherwise.
 */
1149ENTRY(simd_coprocessor_error)
1150	ASM_CLAC
1151	pushl	$0
1152#ifdef CONFIG_X86_INVD_BUG
1153	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
1154	ALTERNATIVE "pushl	$do_general_protection",	\
1155		    "pushl	$do_simd_coprocessor_error",	\
1156		    X86_FEATURE_XMM
1157#else
1158	pushl	$do_simd_coprocessor_error
1159#endif
1160	jmp	common_exception
1161END(simd_coprocessor_error)
1162
/*
 * Entry stub for do_device_not_available.
 * Fills the error-code slot with -1 and the handler slot with the C
 * handler address, then joins common_exception.
 */
1163ENTRY(device_not_available)
1164	ASM_CLAC
1165	pushl	$-1				# mark this as an int
1166	pushl	$do_device_not_available
1167	jmp	common_exception
1168END(device_not_available)
1169
1170#ifdef CONFIG_PARAVIRT
/*
 * Out-of-line bare "iret", built only with CONFIG_PARAVIRT.
 * The exception-table entry pairs a fault on this instruction with the
 * iret_exc fixup code.
 */
1171ENTRY(native_iret)
1172	iret
1173	_ASM_EXTABLE(native_iret, iret_exc)
1174END(native_iret)
1175#endif
1176
/*
 * Entry stub for do_overflow: zero error code, handler address, then
 * common_exception.
 */
1177ENTRY(overflow)
1178	ASM_CLAC
1179	pushl	$0				/* placeholder error code */
1180	pushl	$do_overflow
1181	jmp	common_exception
1182END(overflow)
1183
/*
 * Entry stub for do_bounds: zero error code, handler address, then
 * common_exception.
 */
1184ENTRY(bounds)
1185	ASM_CLAC
1186	pushl	$0				/* placeholder error code */
1187	pushl	$do_bounds
1188	jmp	common_exception
1189END(bounds)
1190
/*
 * Entry stub for do_invalid_op: zero error code, handler address, then
 * common_exception.
 */
1191ENTRY(invalid_op)
1192	ASM_CLAC
1193	pushl	$0				/* placeholder error code */
1194	pushl	$do_invalid_op
1195	jmp	common_exception
1196END(invalid_op)
1197
/*
 * Entry stub for do_coprocessor_segment_overrun: zero error code, handler
 * address, then common_exception.
 */
1198ENTRY(coprocessor_segment_overrun)
1199	ASM_CLAC
1200	pushl	$0				/* placeholder error code */
1201	pushl	$do_coprocessor_segment_overrun
1202	jmp	common_exception
1203END(coprocessor_segment_overrun)
1204
/*
 * Entry stub for do_invalid_TSS.
 * Only the handler address is pushed: common_exception takes the error
 * code from the word directly above it, so this stub relies on the CPU
 * having pushed an error code for this exception (invalid TSS, #TS).
 */
1205ENTRY(invalid_TSS)
1206	ASM_CLAC
1207	pushl	$do_invalid_TSS
1208	jmp	common_exception
1209END(invalid_TSS)
1210
/*
 * Entry stub for do_segment_not_present.
 * Only the handler address is pushed; the error-code slot above it is
 * expected to have been filled by the CPU (segment not present, #NP).
 */
1211ENTRY(segment_not_present)
1212	ASM_CLAC
1213	pushl	$do_segment_not_present
1214	jmp	common_exception
1215END(segment_not_present)
1216
/*
 * Entry stub for do_stack_segment.
 * Only the handler address is pushed; the error-code slot above it is
 * expected to have been filled by the CPU (stack-segment fault, #SS).
 */
1217ENTRY(stack_segment)
1218	ASM_CLAC
1219	pushl	$do_stack_segment
1220	jmp	common_exception
1221END(stack_segment)
1222
/*
 * Entry stub for do_alignment_check.
 * Only the handler address is pushed; the error-code slot above it is
 * expected to have been filled by the CPU (alignment check, #AC).
 */
1223ENTRY(alignment_check)
1224	ASM_CLAC
1225	pushl	$do_alignment_check
1226	jmp	common_exception
1227END(alignment_check)
1228
/*
 * Entry stub for do_divide_error: zero error code, handler address, then
 * common_exception.
 */
1229ENTRY(divide_error)
1230	ASM_CLAC
1231	pushl	$0				# no error code
1232	pushl	$do_divide_error
1233	jmp	common_exception
1234END(divide_error)
1235
1236#ifdef CONFIG_X86_MCE
/*
 * Machine-check entry stub, built only with CONFIG_X86_MCE.
 * Unlike the other stubs, the handler push has no '$': it pushes the
 * 32-bit value currently stored at machine_check_vector, so the handler
 * that common_exception calls is whatever that variable holds at the time
 * of the exception.
 */
1237ENTRY(machine_check)
1238	ASM_CLAC
1239	pushl	$0				/* placeholder error code */
1240	pushl	machine_check_vector		/* memory operand: handler pointer read at run time */
1241	jmp	common_exception
1242END(machine_check)
1243#endif
1244
/*
 * Entry stub for do_spurious_interrupt_bug: zero error code, handler
 * address, then common_exception.
 */
1245ENTRY(spurious_interrupt_bug)
1246	ASM_CLAC
1247	pushl	$0				/* placeholder error code */
1248	pushl	$do_spurious_interrupt_bug
1249	jmp	common_exception
1250END(spurious_interrupt_bug)
1251
1252#ifdef CONFIG_XEN_PV
/*
 * Xen PV event callback entry.
 * Saves a full register frame (orig_eax = -1), then either diverts to
 * xen_iret_crit_fixup when the interrupted EIP lies inside
 * [xen_iret_start_crit, xen_iret_end_crit), or falls into xen_do_upcall,
 * which calls xen_evtchn_do_upcall with the frame pointer in %eax and
 * leaves through ret_from_intr.
 *
 * xen_do_upcall is a second global entry point nested inside this
 * function; the closing ENDPROC names xen_hypervisor_callback only.
 */
1253ENTRY(xen_hypervisor_callback)
1254	pushl	$-1				/* orig_ax = -1 => not a system call */
1255	SAVE_ALL
1256	ENCODE_FRAME_POINTER
1257	TRACE_IRQS_OFF
1258
1259	/*
1260	 * Check to see if we got the event in the critical
1261	 * region in xen_iret_direct, after we've reenabled
1262	 * events and checked for pending events.  This simulates
1263	 * iret instruction's behaviour where it delivers a
1264	 * pending interrupt when enabling interrupts:
1265	 */
1266	movl	PT_EIP(%esp), %eax		/* interrupted EIP from the saved frame */
1267	cmpl	$xen_iret_start_crit, %eax
1268	jb	1f				/* below the critical region */
1269	cmpl	$xen_iret_end_crit, %eax
1270	jae	1f				/* at or past its end */
1271
1272	jmp	xen_iret_crit_fixup
1273
1274ENTRY(xen_do_upcall)
12751:	mov	%esp, %eax
1276	call	xen_evtchn_do_upcall
1277#ifndef CONFIG_PREEMPT
1278	call	xen_maybe_preempt_hcall
1279#endif
1280	jmp	ret_from_intr
1281ENDPROC(xen_hypervisor_callback)
1282
1283/*
1284 * Hypervisor uses this for application faults while it executes.
1285 * We get here for two reasons:
1286 *  1. Fault while reloading DS, ES, FS or GS
1287 *  2. Fault while executing IRET
1288 * Category 1 we fix up by reattempting the load, and zeroing the segment
1289 * register if the load fails.
1290 * Category 2 we fix up by jumping to do_iret_error. We cannot use the
1291 * normal Linux return path in this case because if we use the IRET hypercall
1292 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1293 * We distinguish between categories by maintaining a status value in EAX.
 *
 * NOTE(review): the code below jumps to iret_exc, not to do_iret_error
 * directly; presumably iret_exc is the path that reaches it - confirm.
 *
 * Stack after the initial "pushl %eax": 0(%esp) = saved %eax, and
 * 4/8/12/16(%esp) = the DS/ES/FS/GS selector values that are reloaded.
 * %eax starts at 1; each fixup handler (labels 6-9) clears %eax, zeroes
 * the offending selector slot and retries the load, so %eax == 0 at the
 * test means at least one segment load faulted.
1294 */
1295ENTRY(xen_failsafe_callback)
1296	pushl	%eax
1297	movl	$1, %eax
12981:	mov	4(%esp), %ds
12992:	mov	8(%esp), %es
13003:	mov	12(%esp), %fs
13014:	mov	16(%esp), %gs
1302	/* EAX == 0 => Category 1 (Bad segment)
1303	   EAX != 0 => Category 2 (Bad IRET) */
1304	testl	%eax, %eax
	/* popl and lea leave the flags alone, so jz still sees the testl result */
1305	popl	%eax
1306	lea	16(%esp), %esp			/* drop the four selector slots */
1307	jz	5f
1308	jmp	iret_exc
13095:	pushl	$-1				/* orig_ax = -1 => not a system call */
1310	SAVE_ALL
1311	ENCODE_FRAME_POINTER
1312	jmp	ret_from_exception
1313
1314.section .fixup, "ax"
	/* One fixup per segment load above: flag category 1, zero the slot, retry. */
13156:	xorl	%eax, %eax
1316	movl	%eax, 4(%esp)
1317	jmp	1b
13187:	xorl	%eax, %eax
1319	movl	%eax, 8(%esp)
1320	jmp	2b
13218:	xorl	%eax, %eax
1322	movl	%eax, 12(%esp)
1323	jmp	3b
13249:	xorl	%eax, %eax
1325	movl	%eax, 16(%esp)
1326	jmp	4b
1327.previous
1328	_ASM_EXTABLE(1b, 6b)
1329	_ASM_EXTABLE(2b, 7b)
1330	_ASM_EXTABLE(3b, 8b)
1331	_ASM_EXTABLE(4b, 9b)
1332ENDPROC(xen_failsafe_callback)
1333#endif /* CONFIG_XEN_PV */
1334
1335#ifdef CONFIG_XEN_PVHVM
/*
 * Interrupt entry for HYPERVISOR_CALLBACK_VECTOR under Xen PVHVM; the
 * generated stub calls xen_evtchn_do_upcall (see BUILD_INTERRUPT3).
 */
1336BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1337		 xen_evtchn_do_upcall)
1338#endif
1339
1340
1341#if IS_ENABLED(CONFIG_HYPERV)
1342
/*
 * Hyper-V interrupt entry points, each generated by BUILD_INTERRUPT3 as
 * (entry symbol, vector, C handler called with the frame pointer in %eax).
 */
1343BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1344		 hyperv_vector_handler)
1345
1346BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
1347		 hyperv_reenlightenment_intr)
1348
1349BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
1350		 hv_stimer0_vector_handler)
1351
1352#endif /* CONFIG_HYPERV */
1353
/*
 * Entry stub for do_page_fault.
 * Only the handler address is pushed; the error-code slot above it is
 * expected to have been filled by the CPU (page fault, #PF).
 *
 * NOTE(review): the ALIGN between the push and the jmp sits in the
 * middle of the instruction stream; it looks like leftover padding -
 * confirm what ALIGN expands to before removing or relying on it.
 */
1354ENTRY(page_fault)
1355	ASM_CLAC
1356	pushl	$do_page_fault
1357	ALIGN
1358	jmp common_exception
1359END(page_fault)
1360
/*
 * Shared tail of the exception entry stubs.
 *
 * Stack on entry (top first): handler address, error code (real or a
 * placeholder pushed by the stub), then the exception frame. The pushes
 * below complete the register frame described at the top of this file, so
 * the handler address lands in the %gs slot and the error code in the
 * orig_eax slot.
 *
 * The handler is called with %eax = pointer to the register frame and
 * %edx = error code; control then continues at ret_from_exception.
 */
1361common_exception:
1362	/* the function address is in %gs's slot on the stack */
1363	pushl	%fs
1364	pushl	%es
1365	pushl	%ds
1366	pushl	%eax
	/* %eax is saved above, so it is free to use for loading segment registers */
1367	movl	$(__USER_DS), %eax
1368	movl	%eax, %ds
1369	movl	%eax, %es
1370	movl	$(__KERNEL_PERCPU), %eax
1371	movl	%eax, %fs
1372	pushl	%ebp
1373	pushl	%edi
1374	pushl	%esi
1375	pushl	%edx
1376	pushl	%ecx
1377	pushl	%ebx
1378	SWITCH_TO_KERNEL_STACK
1379	ENCODE_FRAME_POINTER
1380	cld
1381	UNWIND_ESPFIX_STACK
	/*
	 * The handler address must be read out of the %gs slot before
	 * REG_TO_PTGS runs; going by the macro names (defined outside this
	 * chunk) that macro stores the %gs value fetched by GS_TO_REG into
	 * the same slot - confirm.
	 */
1382	GS_TO_REG %ecx
1383	movl	PT_GS(%esp), %edi		# get the function address
1384	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
1385	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
1386	REG_TO_PTGS %ecx
1387	SET_KERNEL_GS %ecx
1388	TRACE_IRQS_OFF
1389	movl	%esp, %eax			# pt_regs pointer
1390	CALL_NOSPEC %edi
1391	jmp	ret_from_exception
1392END(common_exception)
1393
/*
 * Entry stub for do_debug: -1 in the error-code slot, handler address,
 * then common_exception.
 */
1394ENTRY(debug)
1395	/*
1396	 * Entry from sysenter is now handled in common_exception
1397	 */
1398	ASM_CLAC
1399	pushl	$-1				# mark this as an int
1400	pushl	$do_debug
1401	jmp	common_exception
1402END(debug)
1403
1404/*
1405 * NMI is doubly nasty.  It can happen on the first instruction of
1406 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
1407 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
1408 * switched stacks.  We handle both conditions by simply checking whether we
1409 * interrupted kernel code running on the SYSENTER stack.
 *
 * Three paths, all calling do_nmi with %eax = register frame pointer and
 * %edx = 0:
 *   - normal:          call do_nmi on the current stack;
 *   - SYSENTER stack:  run do_nmi on the thread stack, then switch back;
 *   - espfix stack:    (CONFIG_X86_ESPFIX32 only) copy the iret frame,
 *                      run do_nmi on the normal stack, then lss back.
 * Every path leaves through .Lirq_return.
1410 */
1411ENTRY(nmi)
1412	ASM_CLAC
1413
1414#ifdef CONFIG_X86_ESPFIX32
	/* %eax is borrowed to read %ss; popl does not change the flags set by cmpw */
1415	pushl	%eax
1416	movl	%ss, %eax
1417	cmpw	$__ESPFIX_SS, %ax
1418	popl	%eax
1419	je	.Lnmi_espfix_stack
1420#endif
1421
1422	pushl	%eax				# pt_regs->orig_ax
1423	SAVE_ALL_NMI cr3_reg=%edi
1424	ENCODE_FRAME_POINTER
1425	xorl	%edx, %edx			# zero error code
1426	movl	%esp, %eax			# pt_regs pointer
1427
1428	/* Are we currently on the SYSENTER stack? */
	/*
	 * Unsigned compare: the branch is taken only when
	 * 0 <= (end of entry_stack) - esp < SIZEOF_entry_stack.
	 */
1429	movl	PER_CPU_VAR(cpu_entry_area), %ecx
1430	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
1431	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
1432	cmpl	$SIZEOF_entry_stack, %ecx
1433	jb	.Lnmi_from_sysenter_stack
1434
1435	/* Not on SYSENTER stack. */
1436	call	do_nmi
1437	jmp	.Lnmi_return
1438
1439.Lnmi_from_sysenter_stack:
1440	/*
1441	 * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
1442	 * is using the thread stack right now, so it's safe for us to use it.
1443	 */
	/* %ebx carries the old %esp across the call; %eax/%edx still hold the arguments */
1444	movl	%esp, %ebx
1445	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esp
1446	call	do_nmi
1447	movl	%ebx, %esp
1448
1449.Lnmi_return:
1450	CHECK_AND_APPLY_ESPFIX
1451	RESTORE_ALL_NMI cr3_reg=%edi pop=4
1452	jmp	.Lirq_return
1453
1454#ifdef CONFIG_X86_ESPFIX32
1455.Lnmi_espfix_stack:
1456	/*
1457	 * create the pointer to lss back
	 *
	 * "pushl %esp" stores the value %esp had before that push, so after
	 * adding 4 the saved offset points at the original iret frame. The
	 * result is a far pointer (offset at 0(%esp), %ss at 4(%esp)).
1458	 */
1459	pushl	%ss
1460	pushl	%esp
1461	addl	$4, (%esp)
1462	/* copy the iret frame of 12 bytes */
	/*
	 * Each push moves %esp down by 4, so the constant offset 16 walks
	 * down through the original frame one word at a time.
	 */
1463	.rept 3
1464	pushl	16(%esp)
1465	.endr
1466	pushl	%eax
1467	SAVE_ALL_NMI cr3_reg=%edi
1468	ENCODE_FRAME_POINTER
1469	FIXUP_ESPFIX_STACK			# %eax == %esp
1470	xorl	%edx, %edx			# zero error code
1471	call	do_nmi
1472	RESTORE_ALL_NMI cr3_reg=%edi
	/* skip the 4-byte orig_ax slot and the 12-byte frame copy to reach the far pointer */
1473	lss	12+4(%esp), %esp		# back to espfix stack
1474	jmp	.Lirq_return
1475#endif
1476END(nmi)
1477
/*
 * Breakpoint entry. Does not go through common_exception: it saves the
 * register frame itself (orig_eax = -1), calls do_int3 with %eax = frame
 * pointer and %edx = 0, and leaves through ret_from_exception.
 */
1478ENTRY(int3)
1479	ASM_CLAC
1480	pushl	$-1				# mark this as an int
1481
1482	SAVE_ALL switch_stacks=1
1483	ENCODE_FRAME_POINTER
1484	TRACE_IRQS_OFF
1485	xorl	%edx, %edx			# zero error code
1486	movl	%esp, %eax			# pt_regs pointer
1487	call	do_int3
1488	jmp	ret_from_exception
1489END(int3)
1490
/*
 * Entry stub for do_general_protection.
 * Only the handler address is pushed; the error-code slot above it is
 * expected to have been filled by the CPU (general protection, #GP).
 *
 * ASM_CLAC comes first, as in every other stub that feeds
 * common_exception, so that the handler never runs with EFLAGS.AC
 * inherited from the interrupted context. It modifies no registers and
 * does not touch the stack, so the layout common_exception expects is
 * unchanged.
 */
1491ENTRY(general_protection)
	ASM_CLAC
1492	pushl	$do_general_protection
1493	jmp	common_exception
1494END(general_protection)
1495
1496#ifdef CONFIG_KVM_GUEST
/*
 * Entry stub for do_async_page_fault, built only with CONFIG_KVM_GUEST.
 * Same shape as page_fault: only the handler address is pushed, so the
 * error-code slot above it is expected to be filled already.
 */
1497ENTRY(async_page_fault)
1498	ASM_CLAC
1499	pushl	$do_async_page_fault
1500	jmp	common_exception
1501END(async_page_fault)
1502#endif
1503
/*
 * Reset the stack pointer to just below the top of the current kernel
 * stack (leaving room for TOP_OF_KERNEL_STACK_PADDING plus one register
 * frame of PTREGS_SIZE bytes) and call do_exit.
 *
 * This code does not touch %eax, so whatever the caller placed there is
 * passed through unchanged (presumably do_exit's argument under the
 * kernel's register calling convention - confirm).
 * Clobbers %ebp and %esi. Never returns to its caller.
 */
1504ENTRY(rewind_stack_do_exit)
1505	/* Prevent any naive code from trying to unwind to our caller. */
1506	xorl	%ebp, %ebp
1507
1508	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esi
1509	leal	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
1510
1511	call	do_exit
	/* do_exit is not expected to return; spin here if it ever does */
15121:	jmp 1b
1513END(rewind_stack_do_exit)
1514