xref: /openbmc/linux/arch/x86/entry/entry_32.S (revision 023e41632e065d49bcbe31b3c4b336217f96a271)
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 *  Copyright (C) 1991,1992  Linus Torvalds
4 *
5 * entry_32.S contains the system-call and low-level fault and trap handling routines.
6 *
7 * Stack layout while running C code:
8 *	ptrace needs to have all registers on the stack.
9 *	If the order here is changed, it needs to be
10 *	updated in fork.c:copy_process(), signal.c:do_signal(),
11 *	ptrace.c and ptrace.h
12 *
13 *	 0(%esp) - %ebx
14 *	 4(%esp) - %ecx
15 *	 8(%esp) - %edx
16 *	 C(%esp) - %esi
17 *	10(%esp) - %edi
18 *	14(%esp) - %ebp
19 *	18(%esp) - %eax
20 *	1C(%esp) - %ds
21 *	20(%esp) - %es
22 *	24(%esp) - %fs
23 *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
24 *	2C(%esp) - orig_eax
25 *	30(%esp) - %eip
26 *	34(%esp) - %cs
27 *	38(%esp) - %eflags
28 *	3C(%esp) - %oldesp
29 *	40(%esp) - %oldss
30 */
31
32#include <linux/linkage.h>
33#include <linux/err.h>
34#include <asm/thread_info.h>
35#include <asm/irqflags.h>
36#include <asm/errno.h>
37#include <asm/segment.h>
38#include <asm/smp.h>
39#include <asm/percpu.h>
40#include <asm/processor-flags.h>
41#include <asm/irq_vectors.h>
42#include <asm/cpufeatures.h>
43#include <asm/alternative-asm.h>
44#include <asm/asm.h>
45#include <asm/smap.h>
46#include <asm/frame.h>
47#include <asm/nospec-branch.h>
48
49#include "calling.h"
50
51	.section .entry.text, "ax"
52
53/*
54 * We use macros for low-level operations which need to be overridden
55 * for paravirtualization.  The following will never clobber any registers:
56 *   INTERRUPT_RETURN (aka. "iret")
57 *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
58 *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
59 *
60 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
61 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
62 * Allowing a register to be clobbered can shrink the paravirt replacement
63 * enough to patch inline, increasing performance.
64 */
65
/*
 * preempt_stop(): with CONFIG_PREEMPT it disables interrupts and tells the
 * irq tracer about it.  Without CONFIG_PREEMPT it expands to nothing and
 * resume_kernel becomes an alias for restore_all_kernel.
 */
66#ifdef CONFIG_PREEMPT
67# define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
68#else
69# define preempt_stop(clobbers)
70# define resume_kernel		restore_all_kernel
71#endif
72
/*
 * Invoke TRACE_IRQS_ON before a return, but only when the EFLAGS image
 * saved in pt_regs has IF set.  Expands to nothing without
 * CONFIG_TRACE_IRQFLAGS.  Uses local label 1.
 */
73.macro TRACE_IRQS_IRET
74#ifdef CONFIG_TRACE_IRQFLAGS
75	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
76	jz	1f
77	TRACE_IRQS_ON
781:
79#endif
80.endm
81
/*
 * cr3 bit toggled by the SWITCH_TO_*_CR3 macros in this file: set for the
 * user cr3, clear for the kernel cr3.
 */
82#define PTI_SWITCH_MASK         (1 << PAGE_SHIFT)
83
84/*
85 * User gs save/restore
86 *
87 * %gs is used for userland TLS and kernel only uses it for stack
88 * canary which is required to be at %gs:20 by gcc.  Read the comment
89 * at the top of stackprotector.h for more info.
90 *
91 * Local labels 98 and 99 are used.
92 */
93#ifdef CONFIG_X86_32_LAZY_GS
94
95 /* unfortunately push/pop can't be no-op */
/* Reserve the pt_regs %gs slot with a zero placeholder */
96.macro PUSH_GS
97	pushl	$0
98.endm
/* Drop the %gs slot plus \pop additional bytes without loading %gs */
99.macro POP_GS pop=0
100	addl	$(4 + \pop), %esp
101.endm
102.macro POP_GS_EX
103.endm
104
105 /* all the rest are no-op */
106.macro PTGS_TO_GS
107.endm
108.macro PTGS_TO_GS_EX
109.endm
110.macro GS_TO_REG reg
111.endm
112.macro REG_TO_PTGS reg
113.endm
114.macro SET_KERNEL_GS reg
115.endm
116
117#else	/* CONFIG_X86_32_LAZY_GS */
118
/* Save the live %gs selector into its pt_regs slot */
119.macro PUSH_GS
120	pushl	%gs
121.endm
122
/*
 * Pop the saved selector into %gs (local label 98 marks the instruction
 * that may fault), then skip \pop additional bytes if requested.
 */
123.macro POP_GS pop=0
12498:	popl	%gs
125  .if \pop <> 0
126	add	$\pop, %esp
127  .endif
128.endm
/*
 * Exception-table entry for POP_GS: the fixup (label 99) zeroes the saved
 * selector on the stack and retries the pop at 98.
 */
129.macro POP_GS_EX
130.pushsection .fixup, "ax"
13199:	movl	$0, (%esp)
132	jmp	98b
133.popsection
134	_ASM_EXTABLE(98b, 99b)
135.endm
136
/* Load %gs from pt_regs without popping; label 98 marks the load */
137.macro PTGS_TO_GS
13898:	mov	PT_GS(%esp), %gs
139.endm
/*
 * Exception-table entry for PTGS_TO_GS: the fixup (label 99) zeroes
 * PT_GS in pt_regs and retries the load at 98.
 */
140.macro PTGS_TO_GS_EX
141.pushsection .fixup, "ax"
14299:	movl	$0, PT_GS(%esp)
143	jmp	98b
144.popsection
145	_ASM_EXTABLE(98b, 99b)
146.endm
147
/* Copy the current %gs selector into \reg */
148.macro GS_TO_REG reg
149	movl	%gs, \reg
150.endm
/* Store \reg into the pt_regs %gs slot */
151.macro REG_TO_PTGS reg
152	movl	\reg, PT_GS(%esp)
153.endm
/* Load %gs with __KERNEL_STACK_CANARY; \reg is clobbered as scratch */
154.macro SET_KERNEL_GS reg
155	movl	$(__KERNEL_STACK_CANARY), \reg
156	movl	\reg, %gs
157.endm
158
159#endif /* CONFIG_X86_32_LAZY_GS */
160
161/* Unconditionally switch to user cr3 */
/*
 * \scratch_reg: general purpose register, clobbered; it holds the value
 *               written to cr3 when the macro falls through to .Lend.
 *
 * The first instruction is an ALTERNATIVE keyed on X86_FEATURE_PTI that
 * either jumps over the body or is patched away.
 */
162.macro SWITCH_TO_USER_CR3 scratch_reg:req
163	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
164
165	movl	%cr3, \scratch_reg
	/* Setting PTI_SWITCH_MASK selects the user cr3 */
166	orl	$PTI_SWITCH_MASK, \scratch_reg
167	movl	\scratch_reg, %cr3
168.Lend_\@:
169.endm
170
/*
 * Debug-only sanity check (compiled in with CONFIG_DEBUG_ENTRY): executes
 * ud2 when the current cr3 does not have PTI_SWITCH_MASK set.
 *
 * \no_user_check: when 0 (default) the check is skipped if the CS saved in
 *                 pt_regs has no RPL bits set; when non-zero the saved CS is
 *                 not examined at all.
 *
 * Clobbers %eax and the flags when the check is compiled in.
 */
171.macro BUG_IF_WRONG_CR3 no_user_check=0
172#ifdef CONFIG_DEBUG_ENTRY
173	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
174	.if \no_user_check == 0
175	/* coming from usermode? */
176	testl	$SEGMENT_RPL_MASK, PT_CS(%esp)
177	jz	.Lend_\@
178	.endif
179	/* On user-cr3? */
180	movl	%cr3, %eax
181	testl	$PTI_SWITCH_MASK, %eax
182	jnz	.Lend_\@
183	/* From userspace with kernel cr3 - BUG */
184	ud2
185.Lend_\@:
186#endif
187.endm
188
189/*
190 * Switch to kernel cr3 if not already loaded and return current cr3 in
191 * \scratch_reg
192 */
/*
 * \scratch_reg: general purpose register; when the body runs it holds the
 *               cr3 value that was active on entry to the macro.  When the
 *               ALTERNATIVE jump is taken the register is left untouched.
 */
193.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
194	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
195	movl	%cr3, \scratch_reg
196	/* Test if we are already on kernel CR3 */
197	testl	$PTI_SWITCH_MASK, \scratch_reg
198	jz	.Lend_\@
199	andl	$(~PTI_SWITCH_MASK), \scratch_reg
200	movl	\scratch_reg, %cr3
201	/* Return original CR3 in \scratch_reg */
202	orl	$PTI_SWITCH_MASK, \scratch_reg
203.Lend_\@:
204.endm
205
/*
 * Push the segment and integer registers in the pt_regs layout described at
 * the top of this file and load the kernel segment selectors.
 *
 * \pt_regs_ax:    operand pushed into the pt_regs %eax slot (default %eax)
 * \switch_stacks: when > 0, run SWITCH_TO_KERNEL_STACK after the save
 *
 * Clears DF and clobbers %edx; SWITCH_TO_KERNEL_STACK clobbers more.
 */
206.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
207	cld
208	PUSH_GS
209	pushl	%fs
210	pushl	%es
211	pushl	%ds
212	pushl	\pt_regs_ax
213	pushl	%ebp
214	pushl	%edi
215	pushl	%esi
216	pushl	%edx
217	pushl	%ecx
218	pushl	%ebx
	/* %edx was saved above and is free to use as scratch from here on */
219	movl	$(__USER_DS), %edx
220	movl	%edx, %ds
221	movl	%edx, %es
222	movl	$(__KERNEL_PERCPU), %edx
223	movl	%edx, %fs
224	SET_KERNEL_GS %edx
225
226	/* Switch to kernel stack if necessary */
227.if \switch_stacks > 0
228	SWITCH_TO_KERNEL_STACK
229.endif
230
231.endm
232
/*
 * SAVE_ALL variant that does not switch stacks but does switch to the
 * kernel cr3.
 *
 * \cr3_reg: register passed to SWITCH_TO_KERNEL_CR3 as its scratch register
 */
233.macro SAVE_ALL_NMI cr3_reg:req
234	SAVE_ALL
235
236	BUG_IF_WRONG_CR3
237
238	/*
239	 * Now switch the CR3 when PTI is enabled.
240	 *
241	 * We can enter with either user or kernel cr3, the code will
242	 * store the old cr3 in \cr3_reg and switches to the kernel cr3
243	 * if necessary.
244	 */
245	SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
246
	/* NOTE(review): nothing in this macro jumps to the label below */
247.Lend_\@:
248.endm
249
250/*
251 * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The
252 * frame pointer is replaced with an encoded pointer to pt_regs.  The encoding
253 * is just clearing the MSB, which makes it an invalid stack address and is also
254 * a signal to the unwinder that it's a pt_regs pointer in disguise.
255 *
256 * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
257 * original %ebp.
258 */
/* Overwrites %ebp; expands to nothing without CONFIG_FRAME_POINTER */
259.macro ENCODE_FRAME_POINTER
260#ifdef CONFIG_FRAME_POINTER
	/* %ebp = %esp with the most significant bit cleared */
261	mov %esp, %ebp
262	andl $0x7fffffff, %ebp
263#endif
264.endm
265
/*
 * Pop the seven integer registers, in the order of the pt_regs layout at
 * the top of this file (%ebx first, %eax last).
 */
266.macro RESTORE_INT_REGS
267	popl	%ebx
268	popl	%ecx
269	popl	%edx
270	popl	%esi
271	popl	%edi
272	popl	%ebp
273	popl	%eax
274.endm
275
/*
 * Pop the integer registers and then %ds, %es, %fs and %gs.
 *
 * \pop: extra bytes to drop from the stack after %gs (passed to POP_GS)
 *
 * Each segment pop (labels 1-3) has an exception-table entry whose fixup
 * (labels 4-6) zeroes the saved selector on the stack and retries the pop.
 */
276.macro RESTORE_REGS pop=0
277	RESTORE_INT_REGS
2781:	popl	%ds
2792:	popl	%es
2803:	popl	%fs
281	POP_GS \pop
282.pushsection .fixup, "ax"
2834:	movl	$0, (%esp)
284	jmp	1b
2855:	movl	$0, (%esp)
286	jmp	2b
2876:	movl	$0, (%esp)
288	jmp	3b
289.popsection
290	_ASM_EXTABLE(1b, 4b)
291	_ASM_EXTABLE(2b, 5b)
292	_ASM_EXTABLE(3b, 6b)
293	POP_GS_EX
294.endm
295
/*
 * Counterpart of SAVE_ALL_NMI.
 *
 * \cr3_reg: register holding the cr3 value to go back to; it is written to
 *           cr3 only when it has PTI_SWITCH_MASK set
 * \pop:     forwarded to RESTORE_REGS
 */
296.macro RESTORE_ALL_NMI cr3_reg:req pop=0
297	/*
298	 * Now switch the CR3 when PTI is enabled.
299	 *
300	 * We enter with kernel cr3 and switch the cr3 to the value
301	 * stored on \cr3_reg, which is either a user or a kernel cr3.
302	 */
303	ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
304
305	testl	$PTI_SWITCH_MASK, \cr3_reg
306	jz	.Lswitched_\@
307
308	/* User cr3 in \cr3_reg - write it to hardware cr3 */
309	movl	\cr3_reg, %cr3
310
311.Lswitched_\@:
312
313	BUG_IF_WRONG_CR3
314
315	RESTORE_REGS pop=\pop
316.endm
317
/*
 * Switch to the ESPFIX stack segment when the frame at %esp returns to user
 * mode (not VM86) with a stack selector that lives in the LDT.
 *
 * Expects %esp to point to pt_regs.  Clobbers %eax and %edx, plus whatever
 * DISABLE_INTERRUPTS(CLBR_ANY) clobbers.  Also defines GDT_ESPFIX_SS.
 */
318.macro CHECK_AND_APPLY_ESPFIX
319#ifdef CONFIG_X86_ESPFIX32
320#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
321
322	ALTERNATIVE	"jmp .Lend_\@", "", X86_BUG_ESPFIX
323
324	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
325	/*
326	 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
327	 * are returning to the kernel.
328	 * See comments in process.c:copy_thread() for details.
329	 */
330	movb	PT_OLDSS(%esp), %ah
331	movb	PT_CS(%esp), %al
332	andl	$(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
333	cmpl	$((SEGMENT_LDT << 8) | USER_RPL), %eax
334	jne	.Lend_\@	# not returning to user-space with LDT SS: done
335
336	/*
337	 * Setup and switch to ESPFIX stack
338	 *
339	 * We're returning to userspace with a 16 bit stack. The CPU will not
340	 * restore the high word of ESP for us on executing iret... This is an
341	 * "official" bug of all the x86-compatible CPUs, which we can work
342	 * around to make dosemu and wine happy. We do this by preloading the
343	 * high word of ESP with the high word of the userspace ESP while
344	 * compensating for the offset by changing to the ESPFIX segment with
345	 * a base address that matches for the difference.
346	 */
347	mov	%esp, %edx			/* load kernel esp */
348	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
349	mov	%dx, %ax			/* eax: new kernel esp */
350	sub	%eax, %edx			/* offset (low word is 0) */
351	shr	$16, %edx
352	mov	%dl, GDT_ESPFIX_SS + 4		/* bits 16..23 */
353	mov	%dh, GDT_ESPFIX_SS + 7		/* bits 24..31 */
	/* Build the ss:esp pair that lss loads below */
354	pushl	$__ESPFIX_SS
355	pushl	%eax				/* new kernel esp */
356	/*
357	 * Disable interrupts, but do not irqtrace this section: we
358	 * will soon execute iret and the tracer was already set to
359	 * the irqstate after the IRET:
360	 */
361	DISABLE_INTERRUPTS(CLBR_ANY)
362	lss	(%esp), %esp			/* switch to espfix segment */
363.Lend_\@:
364#endif /* CONFIG_X86_ESPFIX32 */
365.endm
366
367/*
368 * Called with pt_regs fully populated and kernel segments loaded,
369 * so we can access PER_CPU and use the integer registers.
370 *
371 * We need to be very careful here with the %esp switch, because an NMI
372 * can happen everywhere. If the NMI handler finds itself on the
373 * entry-stack, it will overwrite the task-stack and everything we
374 * copied there. So allocate the stack-frame on the task-stack and
375 * switch to it before we do any copying.
376 */
377
/*
 * Marker bits kept in the upper half of the saved CS dword in pt_regs.
 * SWITCH_TO_KERNEL_STACK sets them; PARANOID_EXIT_TO_KERNEL_MODE tests and
 * clears them.
 */
378#define CS_FROM_ENTRY_STACK	(1 << 31)
379#define CS_FROM_USER_CR3	(1 << 30)
380
/*
 * Invoked from SAVE_ALL (switch_stacks > 0) with %esp pointing to pt_regs.
 * Switches to the kernel cr3, and if %esp is on the entry stack it copies
 * the frame to the task stack and continues there.
 *
 * Clobbers %eax, %ecx, %esi, %edi and the flags.  Skipped entirely on
 * X86_FEATURE_XENPV.
 */
381.macro SWITCH_TO_KERNEL_STACK
382
383	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
384
385	BUG_IF_WRONG_CR3
386
387	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
388
389	/*
390	 * %eax now contains the entry cr3 and we carry it forward in
391	 * that register for the time this macro runs
392	 */
393
394	/*
395	 * The high bits of the CS dword (__csh) are used for
396	 * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
397	 * hardware didn't do this for us.
398	 */
399	andl	$(0x0000ffff), PT_CS(%esp)
400
401	/* Are we on the entry stack? Bail out if not! */
402	movl	PER_CPU_VAR(cpu_entry_area), %ecx
403	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
404	subl	%esp, %ecx	/* ecx = (end of entry_stack) - esp */
405	cmpl	$SIZEOF_entry_stack, %ecx
406	jae	.Lend_\@
407
408	/* Load stack pointer into %esi and %edi */
409	movl	%esp, %esi
410	movl	%esi, %edi
411
412	/* Move %edi to the top of the entry stack */
413	andl	$(MASK_entry_stack), %edi
414	addl	$(SIZEOF_entry_stack), %edi
415
416	/* Load top of task-stack into %edi */
417	movl	TSS_entry2task_stack(%edi), %edi
418
419	/* Special case - entry from kernel mode via entry stack */
420#ifdef CONFIG_VM86
421	movl	PT_EFLAGS(%esp), %ecx		# mix EFLAGS and CS
422	movb	PT_CS(%esp), %cl
423	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
424#else
425	movl	PT_CS(%esp), %ecx
426	andl	$SEGMENT_RPL_MASK, %ecx
427#endif
428	cmpl	$USER_RPL, %ecx
429	jb	.Lentry_from_kernel_\@
430
431	/* Bytes to copy */
432	movl	$PTREGS_SIZE, %ecx
433
434#ifdef CONFIG_VM86
435	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esi)
436	jz	.Lcopy_pt_regs_\@
437
438	/*
439	 * Stack-frame contains 4 additional segment registers when
440	 * coming from VM86 mode
441	 */
442	addl	$(4 * 4), %ecx
443
444#endif
	/*
	 * Common copy path: %esi = source (entry stack), %edi = top of the
	 * task stack, %ecx = number of bytes to copy.
	 */
445.Lcopy_pt_regs_\@:
446
447	/* Allocate frame on task-stack */
448	subl	%ecx, %edi
449
450	/* Switch to task-stack */
451	movl	%edi, %esp
452
453	/*
454	 * We are now on the task-stack and can safely copy over the
455	 * stack-frame
456	 */
457	shrl	$2, %ecx
458	cld
459	rep movsl
460
461	jmp .Lend_\@
462
463.Lentry_from_kernel_\@:
464
465	/*
466	 * This handles the case when we enter the kernel from
467	 * kernel-mode and %esp points to the entry-stack. When this
468	 * happens we need to switch to the task-stack to run C code,
469	 * but switch back to the entry-stack again when we approach
470	 * iret and return to the interrupted code-path. This usually
471	 * happens when we hit an exception while restoring user-space
472	 * segment registers on the way back to user-space or when the
473	 * sysenter handler runs with eflags.tf set.
474	 *
475	 * When we switch to the task-stack here, we can't trust the
476	 * contents of the entry-stack anymore, as the exception handler
477	 * might be scheduled out or moved to another CPU. Therefore we
478	 * copy the complete entry-stack to the task-stack and set a
479	 * marker in the iret-frame (bit 31 of the CS dword) to detect
480	 * what we've done on the iret path.
481	 *
482	 * On the iret path we copy everything back and switch to the
483	 * entry-stack, so that the interrupted kernel code-path
484	 * continues on the same stack it was interrupted with.
485	 *
486	 * Be aware that an NMI can happen anytime in this code.
487	 *
488	 * %esi: Entry-Stack pointer (same as %esp)
489	 * %edi: Top of the task stack
490	 * %eax: CR3 on kernel entry
491	 */
492
493	/* Calculate number of bytes on the entry stack in %ecx */
494	movl	%esi, %ecx
495
496	/* %ecx to the top of entry-stack */
497	andl	$(MASK_entry_stack), %ecx
498	addl	$(SIZEOF_entry_stack), %ecx
499
500	/* Number of bytes on the entry stack to %ecx */
501	sub	%esi, %ecx
502
503	/* Mark stackframe as coming from entry stack */
504	orl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
505
506	/*
507	 * Test the cr3 used to enter the kernel and add a marker
508	 * so that we can switch back to it before iret.
509	 */
510	testl	$PTI_SWITCH_MASK, %eax
511	jz	.Lcopy_pt_regs_\@
512	orl	$CS_FROM_USER_CR3, PT_CS(%esp)
513
514	/*
515	 * %esi and %edi are unchanged, %ecx contains the number of
516	 * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
517	 * the stack-frame on task-stack and copy everything over
518	 */
519	jmp .Lcopy_pt_regs_\@
520
521.Lend_\@:
522.endm
523
524/*
525 * Switch back from the kernel stack to the entry stack.
526 *
527 * The %esp register must point to pt_regs on the task stack. It will
528 * first calculate the size of the stack-frame to copy, depending on
529 * whether we return to VM86 mode or not. With that it uses 'rep movsl'
530 * to copy the contents of the stack over to the entry stack.
531 *
532 * We must be very careful here, as we can't trust the contents of the
533 * task-stack once we switched to the entry-stack. When an NMI happens
534 * while on the entry-stack, the NMI handler will switch back to the top
535 * of the task stack, overwriting our stack-frame we are about to copy.
536 * Therefore we switch the stack only after everything is copied over.
537 */
/*
 * Clobbers %ecx, %esi, %edi, %ebx and the flags.  Skipped entirely on
 * X86_FEATURE_XENPV.
 */
538.macro SWITCH_TO_ENTRY_STACK
539
540	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
541
542	/* Bytes to copy */
543	movl	$PTREGS_SIZE, %ecx
544
545#ifdef CONFIG_VM86
546	testl	$(X86_EFLAGS_VM), PT_EFLAGS(%esp)
547	jz	.Lcopy_pt_regs_\@
548
549	/* Additional 4 registers to copy when returning to VM86 mode */
550	addl    $(4 * 4), %ecx
551
552.Lcopy_pt_regs_\@:
553#endif
554
555	/* Initialize source and destination for movsl */
556	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
557	subl	%ecx, %edi
558	movl	%esp, %esi
559
560	/* Save future stack pointer in %ebx */
561	movl	%edi, %ebx
562
563	/* Copy over the stack-frame */
564	shrl	$2, %ecx
565	cld
566	rep movsl
567
568	/*
569	 * Switch to entry-stack - needs to happen after everything is
570	 * copied because the NMI handler will overwrite the task-stack
571	 * when on entry-stack
572	 */
573	movl	%ebx, %esp
574
575.Lend_\@:
576.endm
577
578/*
579 * This macro handles the case when we return to kernel-mode on the iret
580 * path and have to switch back to the entry stack and/or user-cr3
581 *
582 * See the comments below the .Lentry_from_kernel_\@ label in the
583 * SWITCH_TO_KERNEL_STACK macro for more details.
584 */
/*
 * Expects %esp to point to pt_regs.  On the slow path it clobbers %eax,
 * %ebx, %ecx, %esi, %edi and the flags.
 */
585.macro PARANOID_EXIT_TO_KERNEL_MODE
586
587	/*
588	 * Test if we entered the kernel with the entry-stack. Most
589	 * likely we did not, because this code only runs on the
590	 * return-to-kernel path.
591	 */
592	testl	$CS_FROM_ENTRY_STACK, PT_CS(%esp)
593	jz	.Lend_\@
594
595	/* Unlikely slow-path */
596
597	/* Clear marker from stack-frame */
598	andl	$(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
599
600	/* Copy the remaining task-stack contents to entry-stack */
601	movl	%esp, %esi
602	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
603
604	/* Bytes on the task-stack to ecx */
605	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
606	subl	%esi, %ecx
607
608	/* Allocate stack-frame on entry-stack */
609	subl	%ecx, %edi
610
611	/*
612	 * Save future stack-pointer, we must not switch until the
613	 * copy is done, otherwise the NMI handler could destroy the
614	 * contents of the task-stack we are about to copy.
615	 */
616	movl	%edi, %ebx
617
618	/* Do the copy */
619	shrl	$2, %ecx
620	cld
621	rep movsl
622
623	/* Safe to switch to entry-stack now */
624	movl	%ebx, %esp
625
626	/*
627	 * We came from entry-stack and need to check if we also need to
628	 * switch back to user cr3.
629	 */
630	testl	$CS_FROM_USER_CR3, PT_CS(%esp)
631	jz	.Lend_\@
632
633	/* Clear marker from stack-frame */
634	andl	$(~CS_FROM_USER_CR3), PT_CS(%esp)
635
636	SWITCH_TO_USER_CR3 scratch_reg=%eax
637
638.Lend_\@:
639.endm
640/*
641 * %eax: prev task
642 * %edx: next task
643 */
/*
 * Stack switch for a context switch: saves the callee-saved registers on
 * the previous task's stack, swaps %esp between the two tasks and then
 * jumps to __switch_to.
 */
644ENTRY(__switch_to_asm)
645	/*
646	 * Save callee-saved registers
647	 * This must match the order in struct inactive_task_frame
648	 */
649	pushl	%ebp
650	pushl	%ebx
651	pushl	%edi
652	pushl	%esi
653
654	/* switch stack */
655	movl	%esp, TASK_threadsp(%eax)
656	movl	TASK_threadsp(%edx), %esp
657
658#ifdef CONFIG_STACKPROTECTOR
	/* Install the next task's canary; %ebx is reloaded from the stack below */
659	movl	TASK_stack_canary(%edx), %ebx
660	movl	%ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
661#endif
662
663#ifdef CONFIG_RETPOLINE
664	/*
665	 * When switching from a shallower to a deeper call stack
666	 * the RSB may either underflow or use entries populated
667	 * with userspace addresses. On CPUs where those concerns
668	 * exist, overwrite the RSB with entries which capture
669	 * speculative execution to prevent attack.
670	 */
671	FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
672#endif
673
674	/* restore callee-saved registers */
675	popl	%esi
676	popl	%edi
677	popl	%ebx
678	popl	%ebp
679
	/* Jump rather than call: __switch_to returns to our caller */
680	jmp	__switch_to
681END(__switch_to_asm)
682
683/*
684 * The unwinder expects the last frame on the stack to always be at the same
685 * offset from the end of the page, which allows it to validate the stack.
686 * Calling schedule_tail() directly would break that convention because it's an
687 * asmlinkage function so its argument has to be pushed on the stack.  This
688 * wrapper creates a proper "end of stack" frame header before the call.
689 */
/*
 * In:  %eax = argument for schedule_tail(), passed on the stack
 * Out: %eax holds the value popped back from that stack slot
 */
690ENTRY(schedule_tail_wrapper)
691	FRAME_BEGIN
692
693	pushl	%eax
694	call	schedule_tail
695	popl	%eax
696
697	FRAME_END
698	ret
699ENDPROC(schedule_tail_wrapper)
700/*
701 * A newly forked process directly context switches into this address.
702 *
703 * eax: prev task we switched from
704 * ebx: kernel thread func (NULL for user thread)
705 * edi: kernel thread arg
706 */
707ENTRY(ret_from_fork)
708	call	schedule_tail_wrapper
709
	/* %ebx != 0: kernel thread, run its function first (label 1) */
710	testl	%ebx, %ebx
711	jnz	1f		/* kernel threads are uncommon */
712
7132:
714	/* When we fork, we trace the syscall return in the child, too. */
715	movl    %esp, %eax
716	call    syscall_return_slowpath
717	STACKLEAK_ERASE
718	jmp     restore_all
719
720	/* kernel thread */
	/* %eax = thread argument, then call the thread function in %ebx */
7211:	movl	%edi, %eax
722	CALL_NOSPEC %ebx
723	/*
724	 * A kernel thread is allowed to return here after successfully
725	 * calling do_execve().  Exit to userspace to complete the execve()
726	 * syscall.
727	 */
728	movl	$0, PT_EAX(%esp)
729	jmp	2b
730END(ret_from_fork)
731
732/*
733 * Return to user mode is not as complex as all this looks,
734 * but we want the default path for a system call return to
735 * go as quickly as possible which is why some of this is
736 * less clear than it otherwise should be.
737 */
738
739	# userspace resumption stub bypassing syscall exit tracing
	/*
	 * ret_from_exception / ret_from_intr: look at the saved CS (and
	 * EFLAGS.VM with CONFIG_VM86) to decide between resume_kernel and
	 * resume_userspace.  Expects %esp to point to pt_regs.
	 */
740	ALIGN
741ret_from_exception:
742	preempt_stop(CLBR_ANY)
743ret_from_intr:
744#ifdef CONFIG_VM86
745	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
746	movb	PT_CS(%esp), %al
747	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
748#else
749	/*
750	 * We can be coming here from child spawned by kernel_thread().
751	 */
752	movl	PT_CS(%esp), %eax
753	andl	$SEGMENT_RPL_MASK, %eax
754#endif
755	cmpl	$USER_RPL, %eax
756	jb	resume_kernel			# not returning to v8086 or userspace
757
758ENTRY(resume_userspace)
759	DISABLE_INTERRUPTS(CLBR_ANY)
760	TRACE_IRQS_OFF
	/* %eax = pointer to pt_regs, the argument to prepare_exit_to_usermode */
761	movl	%esp, %eax
762	call	prepare_exit_to_usermode
763	jmp	restore_all
764END(ret_from_exception)
765
/*
 * Return-to-kernel path with CONFIG_PREEMPT: call preempt_schedule_irq in a
 * loop while __preempt_count is zero and the interrupted context had
 * interrupts enabled, then continue at restore_all_kernel.
 */
766#ifdef CONFIG_PREEMPT
767ENTRY(resume_kernel)
768	DISABLE_INTERRUPTS(CLBR_ANY)
769.Lneed_resched:
770	cmpl	$0, PER_CPU_VAR(__preempt_count)
771	jnz	restore_all_kernel
772	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
773	jz	restore_all_kernel
774	call	preempt_schedule_irq
775	jmp	.Lneed_resched
776END(resume_kernel)
777#endif
778
779GLOBAL(__begin_SYSENTER_singlestep_region)
780/*
781 * All code from here through __end_SYSENTER_singlestep_region is subject
782 * to being single-stepped if a user program sets TF and executes SYSENTER.
783 * There is absolutely nothing that we can do to prevent this from happening
784 * (thanks Intel!).  To keep our handling of this situation as simple as
785 * possible, we handle TF just like AC and NT, except that our #DB handler
786 * will ignore all of the single-step traps generated in this range.
787 */
788
789#ifdef CONFIG_XEN_PV
790/*
791 * Xen doesn't set %esp to be precisely what the normal SYSENTER
792 * entry point expects, so fix it up before using the normal path.
793 */
794ENTRY(xen_sysenter_target)
795	addl	$5*4, %esp			/* remove xen-provided frame */
	/* Join entry_SYSENTER_32 after its cr3 and stack switch */
796	jmp	.Lsysenter_past_esp
797#endif
798
799/*
800 * 32-bit SYSENTER entry.
801 *
802 * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
803 * if X86_FEATURE_SEP is available.  This is the preferred system call
804 * entry on 32-bit systems.
805 *
806 * The SYSENTER instruction, in principle, should *only* occur in the
807 * vDSO.  In practice, a small number of Android devices were shipped
808 * with a copy of Bionic that inlined a SYSENTER instruction.  This
809 * never happened in any of Google's Bionic versions -- it only happened
810 * in a narrow range of Intel-provided versions.
811 *
812 * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
813 * IF and VM in EFLAGS are cleared (IOW: interrupts are off).
814 * SYSENTER does not save anything on the stack,
815 * and does not save old EIP (!!!), ESP, or EFLAGS.
816 *
817 * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
818 * user and/or vm86 state), we explicitly disable the SYSENTER
819 * instruction in vm86 mode by reprogramming the MSRs.
820 *
821 * Arguments:
822 * eax  system call number
823 * ebx  arg1
824 * ecx  arg2
825 * edx  arg3
826 * esi  arg4
827 * edi  arg5
828 * ebp  user stack
829 * 0(%ebp) arg6
830 */
831ENTRY(entry_SYSENTER_32)
832	/*
833	 * On entry-stack with all userspace-regs live - save and
834	 * restore eflags and %eax to use it as scratch-reg for the cr3
835	 * switch.
836	 */
837	pushfl
838	pushl	%eax
839	BUG_IF_WRONG_CR3 no_user_check=1
840	SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
841	popl	%eax
842	popfl
843
844	/* Stack empty again, switch to task stack */
845	movl	TSS_entry2task_stack(%esp), %esp
846
	/* xen_sysenter_target joins here */
847.Lsysenter_past_esp:
848	pushl	$__USER_DS		/* pt_regs->ss */
849	pushl	%ebp			/* pt_regs->sp (stashed in bp) */
850	pushfl				/* pt_regs->flags (except IF = 0) */
851	orl	$X86_EFLAGS_IF, (%esp)	/* Fix IF */
852	pushl	$__USER_CS		/* pt_regs->cs */
853	pushl	$0			/* pt_regs->ip = 0 (placeholder) */
854	pushl	%eax			/* pt_regs->orig_ax */
855	SAVE_ALL pt_regs_ax=$-ENOSYS	/* save rest, stack already switched */
856
857	/*
858	 * SYSENTER doesn't filter flags, so we need to clear NT, AC
859	 * and TF ourselves.  To save a few cycles, we can check whether
860	 * either was set instead of doing an unconditional popfl.
861	 * This needs to happen before enabling interrupts so that
862	 * we don't get preempted with NT set.
863	 *
864	 * If TF is set, we will single-step all the way to here -- do_debug
865	 * will ignore all the traps.  (Yes, this is slow, but so is
866	 * single-stepping in general.  This allows us to avoid having
867	 * a more complicated code to handle the case where a user program
868	 * forces us to single-step through the SYSENTER entry code.)
869	 *
870	 * NB.: .Lsysenter_fix_flags is a label with the code under it moved
871	 * out-of-line as an optimization: NT is unlikely to be set in the
872	 * majority of the cases and instead of polluting the I$ unnecessarily,
873	 * we're keeping that code behind a branch which will predict as
874	 * not-taken and therefore its instructions won't be fetched.
875	 */
876	testl	$X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
877	jnz	.Lsysenter_fix_flags
878.Lsysenter_flags_fixed:
879
880	/*
881	 * User mode is traced as though IRQs are on, and SYSENTER
882	 * turned them off.
883	 */
884	TRACE_IRQS_OFF
885
	/* %eax = pointer to pt_regs, the argument to do_fast_syscall_32 */
886	movl	%esp, %eax
887	call	do_fast_syscall_32
	/* A zero return value in %eax takes the IRET path at .Lsyscall_32_done */
888	/* XEN PV guests always use IRET path */
889	ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
890		    "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
891
892	STACKLEAK_ERASE
893
894/* Opportunistic SYSEXIT */
895	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
896
897	/*
898	 * Setup entry stack - we keep the pointer in %eax and do the
899	 * switch after almost all user-state is restored.
900	 */
901
902	/* Load entry stack pointer and allocate frame for eflags/eax */
903	movl	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
904	subl	$(2*4), %eax
905
906	/* Copy eflags and eax to entry stack */
907	movl	PT_EFLAGS(%esp), %edi
908	movl	PT_EAX(%esp), %esi
909	movl	%edi, (%eax)
910	movl	%esi, 4(%eax)
911
912	/* Restore user registers and segments */
913	movl	PT_EIP(%esp), %edx	/* pt_regs->ip */
914	movl	PT_OLDESP(%esp), %ecx	/* pt_regs->sp */
9151:	mov	PT_FS(%esp), %fs
916	PTGS_TO_GS
917
918	popl	%ebx			/* pt_regs->bx */
919	addl	$2*4, %esp		/* skip pt_regs->cx and pt_regs->dx */
920	popl	%esi			/* pt_regs->si */
921	popl	%edi			/* pt_regs->di */
922	popl	%ebp			/* pt_regs->bp */
923
924	/* Switch to entry stack */
925	movl	%eax, %esp
926
927	/* Now ready to switch the cr3 */
928	SWITCH_TO_USER_CR3 scratch_reg=%eax
929
930	/*
931	 * Restore all flags except IF. (We restore IF separately because
932	 * STI gives a one-instruction window in which we won't be interrupted,
933	 * whereas POPF does not.)
934	 */
935	btrl	$X86_EFLAGS_IF_BIT, (%esp)
936	BUG_IF_WRONG_CR3 no_user_check=1
937	popfl
938	popl	%eax
939
940	/*
941	 * Return back to the vDSO, which will pop ecx and edx.
942	 * Don't bother with DS and ES (they already contain __USER_DS).
943	 */
944	sti
945	sysexit
946
	/* Fixup for a faulting %fs load at label 1: zero PT_FS and retry */
947.pushsection .fixup, "ax"
9482:	movl	$0, PT_FS(%esp)
949	jmp	1b
950.popsection
951	_ASM_EXTABLE(1b, 2b)
952	PTGS_TO_GS_EX
953
	/* Out-of-line: load EFLAGS with X86_EFLAGS_FIXED only, then resume */
954.Lsysenter_fix_flags:
955	pushl	$X86_EFLAGS_FIXED
956	popfl
957	jmp	.Lsysenter_flags_fixed
958GLOBAL(__end_SYSENTER_singlestep_region)
959ENDPROC(entry_SYSENTER_32)
960
961/*
962 * 32-bit legacy system call entry.
963 *
964 * 32-bit x86 Linux system calls traditionally used the INT $0x80
965 * instruction.  INT $0x80 lands here.
966 *
967 * This entry point can be used by any 32-bit program to perform system calls.
968 * Instances of INT $0x80 can be found inline in various programs and
969 * libraries.  It is also used by the vDSO's __kernel_vsyscall
970 * fallback for hardware that doesn't support a faster entry method.
971 * Restarted 32-bit system calls also fall back to INT $0x80
972 * regardless of what instruction was originally used to do the system
973 * call.  (64-bit programs can use INT $0x80 as well, but they can
974 * only run on 64-bit kernels and therefore land in
975 * entry_INT80_compat.)
976 *
977 * This is considered a slow path.  It is not used by most libc
978 * implementations on modern hardware except during process startup.
979 *
980 * Arguments:
981 * eax  system call number
982 * ebx  arg1
983 * ecx  arg2
984 * edx  arg3
985 * esi  arg4
986 * edi  arg5
987 * ebp  arg6
988 */
989ENTRY(entry_INT80_32)
990	ASM_CLAC
991	pushl	%eax			/* pt_regs->orig_ax */
992
993	SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1	/* save rest */
994
995	/*
996	 * User mode is traced as though IRQs are on, and the interrupt gate
997	 * turned them off.
998	 */
999	TRACE_IRQS_OFF
1000
	/* %eax = pointer to pt_regs, the argument to do_int80_syscall_32 */
1001	movl	%esp, %eax
1002	call	do_int80_syscall_32
	/* entry_SYSENTER_32 also jumps here when it takes the IRET path */
1003.Lsyscall_32_done:
1004
1005	STACKLEAK_ERASE
1006
	/* Common return path, reached by jmp restore_all from several places */
1007restore_all:
1008	TRACE_IRQS_IRET
1009	SWITCH_TO_ENTRY_STACK
1010.Lrestore_all_notrace:
1011	CHECK_AND_APPLY_ESPFIX
1012.Lrestore_nocheck:
1013	/* Switch back to user CR3 */
1014	SWITCH_TO_USER_CR3 scratch_reg=%eax
1015
1016	BUG_IF_WRONG_CR3
1017
1018	/* Restore user state */
1019	RESTORE_REGS pop=4			# skip orig_eax/error_code
1020.Lirq_return:
1021	/*
1022	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
1023	 * when returning from IPI handler and when returning from
1024	 * scheduler to user-space.
1025	 */
1026	INTERRUPT_RETURN
1027
	/* Return path for frames that resume kernel-mode code */
1028restore_all_kernel:
1029	TRACE_IRQS_IRET
1030	PARANOID_EXIT_TO_KERNEL_MODE
1031	BUG_IF_WRONG_CR3
1032	RESTORE_REGS 4
1033	jmp	.Lirq_return
1034
	/*
	 * Exception-table fixup target for .Lirq_return: pushes a zero error
	 * code and do_iret_error, then continues at common_exception.
	 */
1035.section .fixup, "ax"
1036ENTRY(iret_exc	)
1037	pushl	$0				# no error code
1038	pushl	$do_iret_error
1039
1040#ifdef CONFIG_DEBUG_ENTRY
1041	/*
1042	 * The stack-frame here is the one that iret faulted on, so it's a
1043	 * return-to-user frame. We are on kernel-cr3 because we come here from
1044	 * the fixup code. This confuses the CR3 checker, so switch to user-cr3
1045	 * as the checker expects it.
1046	 */
1047	pushl	%eax
1048	SWITCH_TO_USER_CR3 scratch_reg=%eax
1049	popl	%eax
1050#endif
1051
1052	jmp	common_exception
1053.previous
1054	_ASM_EXTABLE(.Lirq_return, iret_exc)
1055ENDPROC(entry_INT80_32)
1056
1057.macro FIXUP_ESPFIX_STACK
1058/*
1059 * Switch back from the ESPFIX stack to the normal zero-based stack
1060 *
1061 * We can't call C functions using the ESPFIX stack. This code reads
1062 * the high word of the segment base from the GDT and switches to the
1063 * normal stack and adjusts ESP with the matching offset.
1064 *
 * Clobbers %eax.  Relies on GDT_ESPFIX_SS, which CHECK_AND_APPLY_ESPFIX
 * defines.
 */
1065#ifdef CONFIG_X86_ESPFIX32
1066	/* fixup the stack */
1067	mov	GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
1068	mov	GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
1069	shl	$16, %eax
1070	addl	%esp, %eax			/* the adjusted stack pointer */
	/* Build the ss:esp pair that lss loads below */
1071	pushl	$__KERNEL_DS
1072	pushl	%eax
1073	lss	(%esp), %esp			/* switch to the normal stack segment */
1074#endif
1075.endm
/*
 * If %ss currently holds __ESPFIX_SS, load %ds/%es with __KERNEL_DS and
 * switch back to the normal stack via FIXUP_ESPFIX_STACK.
 * Clobbers %eax.  Uses local label 27.
 */
1076.macro UNWIND_ESPFIX_STACK
1077#ifdef CONFIG_X86_ESPFIX32
1078	movl	%ss, %eax
1079	/* see if on espfix stack */
1080	cmpw	$__ESPFIX_SS, %ax
1081	jne	27f
1082	movl	$__KERNEL_DS, %eax
1083	movl	%eax, %ds
1084	movl	%eax, %es
1085	/* switch to normal stack */
1086	FIXUP_ESPFIX_STACK
108727:
1088#endif
1089.endm
1090
1091/*
1092 * Build the entry stubs with some assembler magic.
1093 * We pack 1 stub into every 8-byte block.
1094 */
	/*
	 * One 8-byte aligned stub per vector from FIRST_EXTERNAL_VECTOR up to
	 * (not including) FIRST_SYSTEM_VECTOR.  Each stub pushes
	 * (~vector + 0x80) and jumps to common_interrupt, which subtracts the
	 * 0x80 again.
	 */
1095	.align 8
1096ENTRY(irq_entries_start)
1097    vector=FIRST_EXTERNAL_VECTOR
1098    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
1099	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
1100    vector=vector+1
1101	jmp	common_interrupt
1102	.align	8
1103    .endr
1104END(irq_entries_start)
1105
1106/*
1107 * the CPU automatically disables interrupts when executing an IRQ vector,
1108 * so IRQ-flags tracing has to follow that:
 *
 * Common tail of the irq_entries_start stubs.  On entry the top of the
 * stack holds the (~vector + 0x80) value pushed by the stub.  The saved
 * register frame is handed to do_IRQ in %eax, and the code leaves through
 * ret_from_intr.
1109 */
1110	.p2align CONFIG_X86_L1_CACHE_SHIFT
1111common_interrupt:
1112	ASM_CLAC
1113	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
1114
1115	SAVE_ALL switch_stacks=1
1116	ENCODE_FRAME_POINTER
1117	TRACE_IRQS_OFF
1118	movl	%esp, %eax			/* argument for do_IRQ: pointer to the saved frame */
1119	call	do_IRQ
1120	jmp	ret_from_intr
1121ENDPROC(common_interrupt)
1122
/*
 * BUILD_INTERRUPT3(name, nr, fn) - emit an interrupt entry point.
 *
 *   name: symbol of the generated entry point
 *   nr:   vector number; ~(nr) is pushed before the registers are saved
 *   fn:   function called with %eax pointing at the saved register frame
 *
 * The generated code leaves through ret_from_intr.
 *
 * NOTE(review): TRACE_IRQS_OFF is the only line not followed by ';' -
 * presumably its definition supplies its own statement terminator (or is
 * empty); confirm against the header that defines it.
 */
1123#define BUILD_INTERRUPT3(name, nr, fn)			\
1124ENTRY(name)						\
1125	ASM_CLAC;					\
1126	pushl	$~(nr);					\
1127	SAVE_ALL switch_stacks=1;			\
1128	ENCODE_FRAME_POINTER;				\
1129	TRACE_IRQS_OFF					\
1130	movl	%esp, %eax;				\
1131	call	fn;					\
1132	jmp	ret_from_intr;				\
1133ENDPROC(name)
1134
/*
 * BUILD_INTERRUPT(name, nr) - shorthand for BUILD_INTERRUPT3 where the
 * called function is the entry name prefixed with "smp_".
 */
1135#define BUILD_INTERRUPT(name, nr)		\
1136	BUILD_INTERRUPT3(name, nr, smp_##name);	\
1137
1138/* The include is where all of the SMP etc. interrupts come from */
1139#include <asm/entry_arch.h>
1140
/*
 * Exception stub: push a zero error code and the address of
 * do_coprocessor_error, then continue in common_exception.
 */
1141ENTRY(coprocessor_error)
1142	ASM_CLAC
1143	pushl	$0				/* dummy error code (orig_eax slot) */
1144	pushl	$do_coprocessor_error		/* handler address (%gs slot) */
1145	jmp	common_exception
1146END(coprocessor_error)
1147
/*
 * Exception stub: push a zero error code and a handler address, then
 * continue in common_exception.
 *
 * With CONFIG_X86_INVD_BUG the handler is chosen by an ALTERNATIVE keyed on
 * X86_FEATURE_XMM between do_general_protection and
 * do_simd_coprocessor_error; presumably do_general_protection is the one
 * used when the CPU lacks XMM - confirm against the ALTERNATIVE macro.
 */
1148ENTRY(simd_coprocessor_error)
1149	ASM_CLAC
1150	pushl	$0
1151#ifdef CONFIG_X86_INVD_BUG
1152	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
1153	ALTERNATIVE "pushl	$do_general_protection",	\
1154		    "pushl	$do_simd_coprocessor_error",	\
1155		    X86_FEATURE_XMM
1156#else
1157	pushl	$do_simd_coprocessor_error
1158#endif
1159	jmp	common_exception
1160END(simd_coprocessor_error)
1161
/*
 * Exception stub: push -1 in place of an error code and the address of
 * do_device_not_available, then continue in common_exception.
 */
1162ENTRY(device_not_available)
1163	ASM_CLAC
1164	pushl	$-1				# mark this as an int
1165	pushl	$do_device_not_available
1166	jmp	common_exception
1167END(device_not_available)
1168
/*
 * Out-of-line iret, built only with CONFIG_PARAVIRT.  An exception-table
 * entry sends a fault on the iret instruction to iret_exc.
 */
1169#ifdef CONFIG_PARAVIRT
1170ENTRY(native_iret)
1171	iret
1172	_ASM_EXTABLE(native_iret, iret_exc)
1173END(native_iret)
1174#endif
1175
/*
 * Exception stub: push a zero error code and the address of do_overflow,
 * then continue in common_exception.
 */
1176ENTRY(overflow)
1177	ASM_CLAC
1178	pushl	$0
1179	pushl	$do_overflow
1180	jmp	common_exception
1181END(overflow)
1182
/*
 * Exception stub: push a zero error code and the address of do_bounds,
 * then continue in common_exception.
 */
1183ENTRY(bounds)
1184	ASM_CLAC
1185	pushl	$0
1186	pushl	$do_bounds
1187	jmp	common_exception
1188END(bounds)
1189
/*
 * Exception stub: push a zero error code and the address of do_invalid_op,
 * then continue in common_exception.
 */
1190ENTRY(invalid_op)
1191	ASM_CLAC
1192	pushl	$0
1193	pushl	$do_invalid_op
1194	jmp	common_exception
1195END(invalid_op)
1196
/*
 * Exception stub: push a zero error code and the address of
 * do_coprocessor_segment_overrun, then continue in common_exception.
 */
1197ENTRY(coprocessor_segment_overrun)
1198	ASM_CLAC
1199	pushl	$0
1200	pushl	$do_coprocessor_segment_overrun
1201	jmp	common_exception
1202END(coprocessor_segment_overrun)
1203
/*
 * Exception stub: push the address of do_invalid_TSS and continue in
 * common_exception.
 *
 * No dummy error code is pushed: this exception is expected to arrive with
 * a hardware-pushed error code already on the stack (per the x86 exception
 * model; not shown in this file).
 */
1204ENTRY(invalid_TSS)
1205	ASM_CLAC
1206	pushl	$do_invalid_TSS
1207	jmp	common_exception
1208END(invalid_TSS)
1209
/*
 * Exception stub: push the address of do_segment_not_present and continue
 * in common_exception.
 *
 * No dummy error code is pushed: this exception is expected to arrive with
 * a hardware-pushed error code already on the stack (per the x86 exception
 * model; not shown in this file).
 */
1210ENTRY(segment_not_present)
1211	ASM_CLAC
1212	pushl	$do_segment_not_present
1213	jmp	common_exception
1214END(segment_not_present)
1215
/*
 * Exception stub: push the address of do_stack_segment and continue in
 * common_exception.
 *
 * No dummy error code is pushed: this exception is expected to arrive with
 * a hardware-pushed error code already on the stack (per the x86 exception
 * model; not shown in this file).
 */
1216ENTRY(stack_segment)
1217	ASM_CLAC
1218	pushl	$do_stack_segment
1219	jmp	common_exception
1220END(stack_segment)
1221
/*
 * Exception stub: push the address of do_alignment_check and continue in
 * common_exception.
 *
 * No dummy error code is pushed: this exception is expected to arrive with
 * a hardware-pushed error code already on the stack (per the x86 exception
 * model; not shown in this file).
 */
1222ENTRY(alignment_check)
1223	ASM_CLAC
1224	pushl	$do_alignment_check
1225	jmp	common_exception
1226END(alignment_check)
1227
/*
 * Exception stub: push a zero error code and the address of
 * do_divide_error, then continue in common_exception.
 */
1228ENTRY(divide_error)
1229	ASM_CLAC
1230	pushl	$0				# no error code
1231	pushl	$do_divide_error
1232	jmp	common_exception
1233END(divide_error)
1234
/*
 * Exception stub, built only with CONFIG_X86_MCE: push a zero error code
 * and a handler address, then continue in common_exception.
 */
1235#ifdef CONFIG_X86_MCE
1236ENTRY(machine_check)
1237	ASM_CLAC
1238	pushl	$0
	/*
	 * No '$' here: this pushes the value stored at machine_check_vector,
	 * not the address of the symbol, so the handler is read at run time.
	 */
1239	pushl	machine_check_vector
1240	jmp	common_exception
1241END(machine_check)
1242#endif
1243
/*
 * Exception stub: push a zero error code and the address of
 * do_spurious_interrupt_bug, then continue in common_exception.
 */
1244ENTRY(spurious_interrupt_bug)
1245	ASM_CLAC
1246	pushl	$0
1247	pushl	$do_spurious_interrupt_bug
1248	jmp	common_exception
1249END(spurious_interrupt_bug)
1250
1251#ifdef CONFIG_XEN_PV
/*
 * Xen PV event callback.  Saves the registers with orig_ax = -1.  If the
 * interrupted EIP lies in [xen_iret_start_crit, xen_iret_end_crit) it
 * branches to xen_iret_crit_fixup; otherwise it falls into xen_do_upcall,
 * which calls xen_evtchn_do_upcall with %eax pointing at the saved frame
 * and leaves through ret_from_intr.
 */
1252ENTRY(xen_hypervisor_callback)
1253	pushl	$-1				/* orig_ax = -1 => not a system call */
1254	SAVE_ALL
1255	ENCODE_FRAME_POINTER
1256	TRACE_IRQS_OFF
1257
1258	/*
1259	 * Check to see if we got the event in the critical
1260	 * region in xen_iret_direct, after we've reenabled
1261	 * events and checked for pending events.  This simulates
1262	 * iret instruction's behaviour where it delivers a
1263	 * pending interrupt when enabling interrupts:
1264	 */
1265	movl	PT_EIP(%esp), %eax
1266	cmpl	$xen_iret_start_crit, %eax
1267	jb	1f				/* below the critical region */
1268	cmpl	$xen_iret_end_crit, %eax
1269	jae	1f				/* at or past its end */
1270
1271	jmp	xen_iret_crit_fixup
1272
1273ENTRY(xen_do_upcall)
12741:	mov	%esp, %eax
1275	call	xen_evtchn_do_upcall
1276#ifndef CONFIG_PREEMPT
1277	call	xen_maybe_preempt_hcall
1278#endif
1279	jmp	ret_from_intr
1280ENDPROC(xen_hypervisor_callback)
1281
1282/*
1283 * Hypervisor uses this for application faults while it executes.
1284 * We get here for two reasons:
1285 *  1. Fault while reloading DS, ES, FS or GS
1286 *  2. Fault while executing IRET
1287 * Category 1 we fix up by reattempting the load, and zeroing the segment
1288 * register if the load fails.
1289 * Category 2 we fix up by jumping to do_iret_error. We cannot use the
1290 * normal Linux return path in this case because if we use the IRET hypercall
1291 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1292 * We distinguish between categories by maintaining a status value in EAX.
 *
 * After the initial push of %eax, the four segment values to reload sit at
 * 4/8/12/16(%esp).  %eax starts as 1; each fixup handler (6-9) clears %eax,
 * zeroes the stack slot whose load faulted, and retries that load.
1293 */
1294ENTRY(xen_failsafe_callback)
1295	pushl	%eax
1296	movl	$1, %eax
12971:	mov	4(%esp), %ds
12982:	mov	8(%esp), %es
12993:	mov	12(%esp), %fs
13004:	mov	16(%esp), %gs
1301	/* EAX == 0 => Category 1 (Bad segment)
1302	   EAX != 0 => Category 2 (Bad IRET) */
1303	testl	%eax, %eax
	/* popl and lea leave the flags alone, so jz still sees the testl result */
1304	popl	%eax
1305	lea	16(%esp), %esp			/* drop the four segment slots */
1306	jz	5f
1307	jmp	iret_exc
13085:	pushl	$-1				/* orig_ax = -1 => not a system call */
1309	SAVE_ALL
1310	ENCODE_FRAME_POINTER
1311	jmp	ret_from_exception
1312
1313.section .fixup, "ax"
13146:	xorl	%eax, %eax
1315	movl	%eax, 4(%esp)
1316	jmp	1b
13177:	xorl	%eax, %eax
1318	movl	%eax, 8(%esp)
1319	jmp	2b
13208:	xorl	%eax, %eax
1321	movl	%eax, 12(%esp)
1322	jmp	3b
13239:	xorl	%eax, %eax
1324	movl	%eax, 16(%esp)
1325	jmp	4b
1326.previous
1327	_ASM_EXTABLE(1b, 6b)
1328	_ASM_EXTABLE(2b, 7b)
1329	_ASM_EXTABLE(3b, 8b)
1330	_ASM_EXTABLE(4b, 9b)
1331ENDPROC(xen_failsafe_callback)
1332#endif /* CONFIG_XEN_PV */
1333
/*
 * Xen PVHVM: entry point for HYPERVISOR_CALLBACK_VECTOR that calls
 * xen_evtchn_do_upcall.
 */
1334#ifdef CONFIG_XEN_PVHVM
1335BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1336		 xen_evtchn_do_upcall)
1337#endif
1338
1339
/*
 * Hyper-V: three interrupt entry points generated with BUILD_INTERRUPT3,
 * each pairing a vector with the function it calls.
 */
1340#if IS_ENABLED(CONFIG_HYPERV)
1341
1342BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1343		 hyperv_vector_handler)
1344
1345BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
1346		 hyperv_reenlightenment_intr)
1347
1348BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
1349		 hv_stimer0_vector_handler)
1350
1351#endif /* CONFIG_HYPERV */
1352
/*
 * Exception stub: push the address of do_page_fault and continue in
 * common_exception.
 *
 * No dummy error code is pushed: this exception is expected to arrive with
 * a hardware-pushed error code already on the stack (per the x86 exception
 * model; not shown in this file).
 *
 * NOTE(review): the ALIGN sits between the push and the jmp, so any padding
 * it emits is executed on this path - presumably harmless filler; confirm.
 */
1353ENTRY(page_fault)
1354	ASM_CLAC
1355	pushl	$do_page_fault
1356	ALIGN
1357	jmp common_exception
1358END(page_fault)
1359
/*
 * Shared tail of the exception stubs.
 *
 * On entry the stack holds, from the top: the handler address pushed by the
 * stub (it occupies the %gs slot of the frame), then the error code (the
 * orig_eax slot), then the exception frame.  This code saves the remaining
 * registers in the order given by the stack-layout comment at the top of
 * the file, loads kernel segment values, and calls the handler indirectly
 * with
 *	%eax = pointer to the saved register frame
 *	%edx = error code
 * before leaving through ret_from_exception.
 */
1360common_exception:
1361	/* the function address is in %gs's slot on the stack */
1362	pushl	%fs
1363	pushl	%es
1364	pushl	%ds
1365	pushl	%eax
1366	movl	$(__USER_DS), %eax
1367	movl	%eax, %ds
1368	movl	%eax, %es
1369	movl	$(__KERNEL_PERCPU), %eax
1370	movl	%eax, %fs
1371	pushl	%ebp
1372	pushl	%edi
1373	pushl	%esi
1374	pushl	%edx
1375	pushl	%ecx
1376	pushl	%ebx
1377	SWITCH_TO_KERNEL_STACK
1378	ENCODE_FRAME_POINTER
1379	cld
1380	UNWIND_ESPFIX_STACK
1381	GS_TO_REG %ecx
	/* fetch handler and error code before their frame slots are overwritten */
1382	movl	PT_GS(%esp), %edi		# get the function address
1383	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
1384	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
1385	REG_TO_PTGS %ecx
1386	SET_KERNEL_GS %ecx
1387	TRACE_IRQS_OFF
1388	movl	%esp, %eax			# pt_regs pointer
1389	CALL_NOSPEC %edi
1390	jmp	ret_from_exception
1391END(common_exception)
1392
/*
 * Exception stub: push -1 in place of an error code and the address of
 * do_debug, then continue in common_exception.
 */
1393ENTRY(debug)
1394	/*
1395	 * Entry from sysenter is now handled in common_exception
1396	 */
1397	ASM_CLAC
1398	pushl	$-1				# mark this as an int
1399	pushl	$do_debug
1400	jmp	common_exception
1401END(debug)
1402
1403/*
1404 * NMI is doubly nasty.  It can happen on the first instruction of
1405 * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
1406 * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
1407 * switched stacks.  We handle both conditions by simply checking whether we
1408 * interrupted kernel code running on the SYSENTER stack.
 *
 * Three paths call do_nmi, each with %eax = saved-frame pointer and
 * %edx = 0:
 *  - normal: call on the current stack;
 *  - on the SYSENTER (entry) stack: switch to cpu_current_top_of_stack for
 *    the call, then switch back;
 *  - on the ESPFIX stack (CONFIG_X86_ESPFIX32): copy the iret frame, move
 *    to the normal stack for the call, then lss back to the ESPFIX stack.
1409 */
1410ENTRY(nmi)
1411	ASM_CLAC
1412
1413#ifdef CONFIG_X86_ESPFIX32
1414	pushl	%eax
1415	movl	%ss, %eax
1416	cmpw	$__ESPFIX_SS, %ax
1417	popl	%eax				/* popl leaves the flags from cmpw intact */
1418	je	.Lnmi_espfix_stack
1419#endif
1420
1421	pushl	%eax				# pt_regs->orig_ax
1422	SAVE_ALL_NMI cr3_reg=%edi
1423	ENCODE_FRAME_POINTER
1424	xorl	%edx, %edx			# zero error code
1425	movl	%esp, %eax			# pt_regs pointer
1426
1427	/* Are we currently on the SYSENTER stack? */
1428	movl	PER_CPU_VAR(cpu_entry_area), %ecx
1429	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
1430	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
	/*
	 * Unsigned compare: an %esp above the end of the entry stack makes
	 * the difference wrap to a large value, so one jb covers both bounds.
	 */
1431	cmpl	$SIZEOF_entry_stack, %ecx
1432	jb	.Lnmi_from_sysenter_stack
1433
1434	/* Not on SYSENTER stack. */
1435	call	do_nmi
1436	jmp	.Lnmi_return
1437
1438.Lnmi_from_sysenter_stack:
1439	/*
1440	 * We're on the SYSENTER stack.  Switch off.  No one (not even debug)
1441	 * is using the thread stack right now, so it's safe for us to use it.
1442	 */
1443	movl	%esp, %ebx			/* keep the old %esp in %ebx across the call */
1444	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esp
1445	call	do_nmi
1446	movl	%ebx, %esp
1447
1448.Lnmi_return:
1449	CHECK_AND_APPLY_ESPFIX
1450	RESTORE_ALL_NMI cr3_reg=%edi pop=4
1451	jmp	.Lirq_return
1452
1453#ifdef CONFIG_X86_ESPFIX32
1454.Lnmi_espfix_stack:
1455	/*
1456	 * create the pointer to lss back
	 *
	 * The pushed %esp value points at the %ss slot just pushed; adding 4
	 * turns it back into the %esp from before the "pushl %ss".
1457	 */
1458	pushl	%ss
1459	pushl	%esp
1460	addl	$4, (%esp)
1461	/* copy the iret frame of 12 bytes */
	/* each push lowers %esp by 4, so the fixed offset 16 walks the frame */
1462	.rept 3
1463	pushl	16(%esp)
1464	.endr
1465	pushl	%eax
1466	SAVE_ALL_NMI cr3_reg=%edi
1467	ENCODE_FRAME_POINTER
1468	FIXUP_ESPFIX_STACK			# %eax == %esp
1469	xorl	%edx, %edx			# zero error code
1470	call	do_nmi
1471	RESTORE_ALL_NMI cr3_reg=%edi
	/*
	 * Skip 4 + 12 bytes to reach the far pointer built above; presumably
	 * these are the value pushed from %eax and the copied iret frame -
	 * confirm against RESTORE_ALL_NMI.
	 */
1472	lss	12+4(%esp), %esp		# back to espfix stack
1473	jmp	.Lirq_return
1474#endif
1475END(nmi)
1476
/*
 * Entry stub that does not go through common_exception: it saves the
 * registers itself and calls do_int3 directly with %eax = saved-frame
 * pointer and %edx = 0, then leaves through ret_from_exception.
 */
1477ENTRY(int3)
1478	ASM_CLAC
1479	pushl	$-1				# mark this as an int
1480
1481	SAVE_ALL switch_stacks=1
1482	ENCODE_FRAME_POINTER
1483	TRACE_IRQS_OFF
1484	xorl	%edx, %edx			# zero error code
1485	movl	%esp, %eax			# pt_regs pointer
1486	call	do_int3
1487	jmp	ret_from_exception
1488END(int3)
1489
/*
 * Exception stub: push the address of do_general_protection and continue
 * in common_exception.
 *
 * No dummy error code is pushed: this exception is expected to arrive with
 * a hardware-pushed error code already on the stack (per the x86 exception
 * model; not shown in this file).
 *
 * ASM_CLAC is issued first, as in every other stub that funnels into
 * common_exception, so this entry point does not differ from its siblings.
 */
1490ENTRY(general_protection)
	ASM_CLAC
1491	pushl	$do_general_protection
1492	jmp	common_exception
1493END(general_protection)
1494
/*
 * Exception stub, built only with CONFIG_KVM_GUEST: push the address of
 * do_async_page_fault and continue in common_exception.
 *
 * No dummy error code is pushed: this exception is expected to arrive with
 * a hardware-pushed error code already on the stack (per the x86 exception
 * model; not shown in this file).
 */
1495#ifdef CONFIG_KVM_GUEST
1496ENTRY(async_page_fault)
1497	ASM_CLAC
1498	pushl	$do_async_page_fault
1499	jmp	common_exception
1500END(async_page_fault)
1501#endif
1502
/*
 * Reset %esp to a fixed position below the per-CPU
 * cpu_current_top_of_stack value and call do_exit from there.  %ebp is
 * cleared first.  Registers other than %ebp, %esi and %esp are passed
 * through to do_exit untouched.
 */
1503ENTRY(rewind_stack_do_exit)
1504	/* Prevent any naive code from trying to unwind to our caller. */
1505	xorl	%ebp, %ebp
1506
1507	movl	PER_CPU_VAR(cpu_current_top_of_stack), %esi
	/* leave room below the top for the padding plus PTREGS_SIZE bytes */
1508	leal	-TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
1509
1510	call	do_exit
	/* spin here if do_exit ever returns */
15111:	jmp 1b
1512END(rewind_stack_do_exit)
1513