xref: /openbmc/linux/arch/x86/entry/common.c (revision 86aa961bb4619a68077ebeba21c52e9ba0eab43d)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */
91f484aa6SAndy Lutomirski 
101f484aa6SAndy Lutomirski #include <linux/kernel.h>
111f484aa6SAndy Lutomirski #include <linux/sched.h>
1268db0cf1SIngo Molnar #include <linux/sched/task_stack.h>
1327d6b4d1SThomas Gleixner #include <linux/entry-common.h>
141f484aa6SAndy Lutomirski #include <linux/mm.h>
151f484aa6SAndy Lutomirski #include <linux/smp.h>
161f484aa6SAndy Lutomirski #include <linux/errno.h>
171f484aa6SAndy Lutomirski #include <linux/ptrace.h>
181f484aa6SAndy Lutomirski #include <linux/export.h>
192fbd7af5SDan Williams #include <linux/nospec.h>
205ea0727bSThomas Garnier #include <linux/syscalls.h>
215f409e20SRik van Riel #include <linux/uaccess.h>
221f484aa6SAndy Lutomirski 
232f6474e4SThomas Gleixner #ifdef CONFIG_XEN_PV
242f6474e4SThomas Gleixner #include <xen/xen-ops.h>
252f6474e4SThomas Gleixner #include <xen/events.h>
262f6474e4SThomas Gleixner #endif
272f6474e4SThomas Gleixner 
2822ca647cSThomas Gleixner #include <asm/apic.h>
291f484aa6SAndy Lutomirski #include <asm/desc.h>
301f484aa6SAndy Lutomirski #include <asm/traps.h>
31710246dfSAndy Lutomirski #include <asm/vdso.h>
32cd4d09ecSBorislav Petkov #include <asm/cpufeature.h>
335f409e20SRik van Riel #include <asm/fpu/api.h>
3404dcbdb8SThomas Gleixner #include <asm/nospec-branch.h>
3522fe5b04SThomas Gleixner #include <asm/io_bitmap.h>
3699ce3255SBenjamin Thiel #include <asm/syscall.h>
372f6474e4SThomas Gleixner #include <asm/irq_stack.h>
381f484aa6SAndy Lutomirski 
390b085e68SThomas Gleixner #ifdef CONFIG_X86_64
402978996fSH. Peter Anvin (Intel) 
do_syscall_x64(struct pt_regs * regs,int nr)412978996fSH. Peter Anvin (Intel) static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
422978996fSH. Peter Anvin (Intel) {
432978996fSH. Peter Anvin (Intel) 	/*
442978996fSH. Peter Anvin (Intel) 	 * Convert negative numbers to very high and thus out of range
452978996fSH. Peter Anvin (Intel) 	 * numbers for comparisons.
462978996fSH. Peter Anvin (Intel) 	 */
472978996fSH. Peter Anvin (Intel) 	unsigned int unr = nr;
482978996fSH. Peter Anvin (Intel) 
492978996fSH. Peter Anvin (Intel) 	if (likely(unr < NR_syscalls)) {
502978996fSH. Peter Anvin (Intel) 		unr = array_index_nospec(unr, NR_syscalls);
51eb0f175bSLinus Torvalds 		regs->ax = x64_sys_call(regs, unr);
522978996fSH. Peter Anvin (Intel) 		return true;
532978996fSH. Peter Anvin (Intel) 	}
542978996fSH. Peter Anvin (Intel) 	return false;
552978996fSH. Peter Anvin (Intel) }
562978996fSH. Peter Anvin (Intel) 
do_syscall_x32(struct pt_regs * regs,int nr)572978996fSH. Peter Anvin (Intel) static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
582978996fSH. Peter Anvin (Intel) {
592978996fSH. Peter Anvin (Intel) 	/*
602978996fSH. Peter Anvin (Intel) 	 * Adjust the starting offset of the table, and convert numbers
612978996fSH. Peter Anvin (Intel) 	 * < __X32_SYSCALL_BIT to very high and thus out of range
622978996fSH. Peter Anvin (Intel) 	 * numbers for comparisons.
632978996fSH. Peter Anvin (Intel) 	 */
642978996fSH. Peter Anvin (Intel) 	unsigned int xnr = nr - __X32_SYSCALL_BIT;
652978996fSH. Peter Anvin (Intel) 
662978996fSH. Peter Anvin (Intel) 	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
672978996fSH. Peter Anvin (Intel) 		xnr = array_index_nospec(xnr, X32_NR_syscalls);
68eb0f175bSLinus Torvalds 		regs->ax = x32_sys_call(regs, xnr);
692978996fSH. Peter Anvin (Intel) 		return true;
702978996fSH. Peter Anvin (Intel) 	}
712978996fSH. Peter Anvin (Intel) 	return false;
722978996fSH. Peter Anvin (Intel) }
732978996fSH. Peter Anvin (Intel) 
do_syscall_64(struct pt_regs * regs,int nr)742978996fSH. Peter Anvin (Intel) __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
750b085e68SThomas Gleixner {
76fe950f60SKees Cook 	add_random_kstack_offset();
7727d6b4d1SThomas Gleixner 	nr = syscall_enter_from_user_mode(regs, nr);
780b085e68SThomas Gleixner 
790b085e68SThomas Gleixner 	instrumentation_begin();
802978996fSH. Peter Anvin (Intel) 
812978996fSH. Peter Anvin (Intel) 	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
822978996fSH. Peter Anvin (Intel) 		/* Invalid system call, but still a system call. */
83b337b496SH. Peter Anvin (Intel) 		regs->ax = __x64_sys_ni_syscall(regs);
841e423bffSAndy Lutomirski 	}
852978996fSH. Peter Anvin (Intel) 
868f159f1dSThomas Gleixner 	instrumentation_end();
87167fd210SThomas Gleixner 	syscall_exit_to_user_mode(regs);
881e423bffSAndy Lutomirski }
891e423bffSAndy Lutomirski #endif
901e423bffSAndy Lutomirski 
91bd2d3a3bSAndy Lutomirski #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
syscall_32_enter(struct pt_regs * regs)922978996fSH. Peter Anvin (Intel) static __always_inline int syscall_32_enter(struct pt_regs *regs)
93bd2d3a3bSAndy Lutomirski {
940b085e68SThomas Gleixner 	if (IS_ENABLED(CONFIG_IA32_EMULATION))
950b085e68SThomas Gleixner 		current_thread_info()->status |= TS_COMPAT;
964facb95bSThomas Gleixner 
972978996fSH. Peter Anvin (Intel) 	return (int)regs->orig_ax;
98bd2d3a3bSAndy Lutomirski }
99bd2d3a3bSAndy Lutomirski 
#ifdef CONFIG_IA32_EMULATION
/*
 * Starts out true and is placed in __ro_after_init data.
 * NOTE(review): presumably the runtime on/off switch for IA32 emulation;
 * its readers and writers are not in this file - confirm before relying
 * on that.
 */
bool __ia32_enabled __ro_after_init = true;
#endif
103f259af26SNikolay Borisov 
1040b085e68SThomas Gleixner /*
1050b085e68SThomas Gleixner  * Invoke a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.
1060b085e68SThomas Gleixner  */
do_syscall_32_irqs_on(struct pt_regs * regs,int nr)1072978996fSH. Peter Anvin (Intel) static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
1080b085e68SThomas Gleixner {
1092978996fSH. Peter Anvin (Intel) 	/*
1102978996fSH. Peter Anvin (Intel) 	 * Convert negative numbers to very high and thus out of range
1112978996fSH. Peter Anvin (Intel) 	 * numbers for comparisons.
1122978996fSH. Peter Anvin (Intel) 	 */
1132978996fSH. Peter Anvin (Intel) 	unsigned int unr = nr;
1142978996fSH. Peter Anvin (Intel) 
1152978996fSH. Peter Anvin (Intel) 	if (likely(unr < IA32_NR_syscalls)) {
1162978996fSH. Peter Anvin (Intel) 		unr = array_index_nospec(unr, IA32_NR_syscalls);
117eb0f175bSLinus Torvalds 		regs->ax = ia32_sys_call(regs, unr);
1182978996fSH. Peter Anvin (Intel) 	} else if (nr != -1) {
119b337b496SH. Peter Anvin (Intel) 		regs->ax = __ia32_sys_ni_syscall(regs);
120bd2d3a3bSAndy Lutomirski 	}
121bd2d3a3bSAndy Lutomirski }
122710246dfSAndy Lutomirski 
1234591766fSThomas Gleixner #ifdef CONFIG_IA32_EMULATION
int80_is_external(void)12422ca647cSThomas Gleixner static __always_inline bool int80_is_external(void)
12522ca647cSThomas Gleixner {
12622ca647cSThomas Gleixner 	const unsigned int offs = (0x80 / 32) * 0x10;
12722ca647cSThomas Gleixner 	const u32 bit = BIT(0x80 % 32);
12822ca647cSThomas Gleixner 
12922ca647cSThomas Gleixner 	/* The local APIC on XENPV guests is fake */
13022ca647cSThomas Gleixner 	if (cpu_feature_enabled(X86_FEATURE_XENPV))
13122ca647cSThomas Gleixner 		return false;
13222ca647cSThomas Gleixner 
13322ca647cSThomas Gleixner 	/*
13422ca647cSThomas Gleixner 	 * If vector 0x80 is set in the APIC ISR then this is an external
13522ca647cSThomas Gleixner 	 * interrupt. Either from broken hardware or injected by a VMM.
13622ca647cSThomas Gleixner 	 *
13722ca647cSThomas Gleixner 	 * Note: In guest mode this is only valid for secure guests where
13822ca647cSThomas Gleixner 	 * the secure module fully controls the vAPIC exposed to the guest.
13922ca647cSThomas Gleixner 	 */
14022ca647cSThomas Gleixner 	return apic_read(APIC_ISR + offs) & bit;
14122ca647cSThomas Gleixner }
14222ca647cSThomas Gleixner 
1434591766fSThomas Gleixner /**
144*eb36b0dcSPawan Gupta  * do_int80_emulation - 32-bit legacy syscall C entry from asm
1454591766fSThomas Gleixner  *
1464591766fSThomas Gleixner  * This entry point can be used by 32-bit and 64-bit programs to perform
1474591766fSThomas Gleixner  * 32-bit system calls.  Instances of INT $0x80 can be found inline in
1484591766fSThomas Gleixner  * various programs and libraries.  It is also used by the vDSO's
1494591766fSThomas Gleixner  * __kernel_vsyscall fallback for hardware that doesn't support a faster
1504591766fSThomas Gleixner  * entry method.  Restarted 32-bit system calls also fall back to INT
1514591766fSThomas Gleixner  * $0x80 regardless of what instruction was originally used to do the
1524591766fSThomas Gleixner  * system call.
1534591766fSThomas Gleixner  *
1544591766fSThomas Gleixner  * This is considered a slow path.  It is not used by most libc
1554591766fSThomas Gleixner  * implementations on modern hardware except during process startup.
1564591766fSThomas Gleixner  *
1574591766fSThomas Gleixner  * The arguments for the INT $0x80 based syscall are on stack in the
1584591766fSThomas Gleixner  * pt_regs structure:
1594591766fSThomas Gleixner  *   eax:				system call number
1604591766fSThomas Gleixner  *   ebx, ecx, edx, esi, edi, ebp:	arg1 - arg 6
1614591766fSThomas Gleixner  */
do_int80_emulation(struct pt_regs * regs)162*eb36b0dcSPawan Gupta __visible noinstr void do_int80_emulation(struct pt_regs *regs)
1634591766fSThomas Gleixner {
1644591766fSThomas Gleixner 	int nr;
1654591766fSThomas Gleixner 
16622ca647cSThomas Gleixner 	/* Kernel does not use INT $0x80! */
16722ca647cSThomas Gleixner 	if (unlikely(!user_mode(regs))) {
16822ca647cSThomas Gleixner 		irqentry_enter(regs);
16922ca647cSThomas Gleixner 		instrumentation_begin();
17022ca647cSThomas Gleixner 		panic("Unexpected external interrupt 0x80\n");
17122ca647cSThomas Gleixner 	}
17222ca647cSThomas Gleixner 
17322ca647cSThomas Gleixner 	/*
17422ca647cSThomas Gleixner 	 * Establish kernel context for instrumentation, including for
17522ca647cSThomas Gleixner 	 * int80_is_external() below which calls into the APIC driver.
17622ca647cSThomas Gleixner 	 * Identical for soft and external interrupts.
17722ca647cSThomas Gleixner 	 */
1784591766fSThomas Gleixner 	enter_from_user_mode(regs);
1794591766fSThomas Gleixner 
1804591766fSThomas Gleixner 	instrumentation_begin();
1814591766fSThomas Gleixner 	add_random_kstack_offset();
1824591766fSThomas Gleixner 
18322ca647cSThomas Gleixner 	/* Validate that this is a soft interrupt to the extent possible */
18422ca647cSThomas Gleixner 	if (unlikely(int80_is_external()))
18522ca647cSThomas Gleixner 		panic("Unexpected external interrupt 0x80\n");
18622ca647cSThomas Gleixner 
1874591766fSThomas Gleixner 	/*
1884591766fSThomas Gleixner 	 * The low level idtentry code pushed -1 into regs::orig_ax
1894591766fSThomas Gleixner 	 * and regs::ax contains the syscall number.
1904591766fSThomas Gleixner 	 *
1914591766fSThomas Gleixner 	 * User tracing code (ptrace or signal handlers) might assume
1924591766fSThomas Gleixner 	 * that the regs::orig_ax contains a 32-bit number on invoking
1934591766fSThomas Gleixner 	 * a 32-bit syscall.
1944591766fSThomas Gleixner 	 *
1954591766fSThomas Gleixner 	 * Establish the syscall convention by saving the 32bit truncated
1964591766fSThomas Gleixner 	 * syscall number in regs::orig_ax and by invalidating regs::ax.
1974591766fSThomas Gleixner 	 */
1984591766fSThomas Gleixner 	regs->orig_ax = regs->ax & GENMASK(31, 0);
1994591766fSThomas Gleixner 	regs->ax = -ENOSYS;
2004591766fSThomas Gleixner 
2014591766fSThomas Gleixner 	nr = syscall_32_enter(regs);
2024591766fSThomas Gleixner 
2034591766fSThomas Gleixner 	local_irq_enable();
2044591766fSThomas Gleixner 	nr = syscall_enter_from_user_mode_work(regs, nr);
2054591766fSThomas Gleixner 	do_syscall_32_irqs_on(regs, nr);
2064591766fSThomas Gleixner 
2074591766fSThomas Gleixner 	instrumentation_end();
2084591766fSThomas Gleixner 	syscall_exit_to_user_mode(regs);
2094591766fSThomas Gleixner }
2104591766fSThomas Gleixner #else /* CONFIG_IA32_EMULATION */
2114591766fSThomas Gleixner 
2124591766fSThomas Gleixner /* Handles int $0x80 on a 32bit kernel */
do_int80_syscall_32(struct pt_regs * regs)2138f159f1dSThomas Gleixner __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
2148b13c255SAndy Lutomirski {
2152978996fSH. Peter Anvin (Intel) 	int nr = syscall_32_enter(regs);
2168f159f1dSThomas Gleixner 
217fe950f60SKees Cook 	add_random_kstack_offset();
2184facb95bSThomas Gleixner 	/*
2192978996fSH. Peter Anvin (Intel) 	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
2202978996fSH. Peter Anvin (Intel) 	 * orig_ax, the int return value truncates it. This matches
2212978996fSH. Peter Anvin (Intel) 	 * the semantics of syscall_get_nr().
2224facb95bSThomas Gleixner 	 */
2232978996fSH. Peter Anvin (Intel) 	nr = syscall_enter_from_user_mode(regs, nr);
2249caa7ff5SPeter Zijlstra 	instrumentation_begin();
2254facb95bSThomas Gleixner 
2260b085e68SThomas Gleixner 	do_syscall_32_irqs_on(regs, nr);
2279caa7ff5SPeter Zijlstra 
2289caa7ff5SPeter Zijlstra 	instrumentation_end();
229167fd210SThomas Gleixner 	syscall_exit_to_user_mode(regs);
2308f159f1dSThomas Gleixner }
2314591766fSThomas Gleixner #endif /* !CONFIG_IA32_EMULATION */
2328f159f1dSThomas Gleixner 
__do_fast_syscall_32(struct pt_regs * regs)2330b085e68SThomas Gleixner static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
2348f159f1dSThomas Gleixner {
2352978996fSH. Peter Anvin (Intel) 	int nr = syscall_32_enter(regs);
2368f159f1dSThomas Gleixner 	int res;
2378f159f1dSThomas Gleixner 
238fe950f60SKees Cook 	add_random_kstack_offset();
2394facb95bSThomas Gleixner 	/*
2404facb95bSThomas Gleixner 	 * This cannot use syscall_enter_from_user_mode() as it has to
2414facb95bSThomas Gleixner 	 * fetch EBP before invoking any of the syscall entry work
2424facb95bSThomas Gleixner 	 * functions.
2434facb95bSThomas Gleixner 	 */
2444facb95bSThomas Gleixner 	syscall_enter_from_user_mode_prepare(regs);
2454facb95bSThomas Gleixner 
2460b085e68SThomas Gleixner 	instrumentation_begin();
2478f159f1dSThomas Gleixner 	/* Fetch EBP from where the vDSO stashed it. */
2488f159f1dSThomas Gleixner 	if (IS_ENABLED(CONFIG_X86_64)) {
2498f159f1dSThomas Gleixner 		/*
2508f159f1dSThomas Gleixner 		 * Micro-optimization: the pointer we're following is
2518f159f1dSThomas Gleixner 		 * explicitly 32 bits, so it can't be out of range.
2528f159f1dSThomas Gleixner 		 */
2538f159f1dSThomas Gleixner 		res = __get_user(*(u32 *)&regs->bp,
2548f159f1dSThomas Gleixner 			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
2558f159f1dSThomas Gleixner 	} else {
2568f159f1dSThomas Gleixner 		res = get_user(*(u32 *)&regs->bp,
2578f159f1dSThomas Gleixner 		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
2588f159f1dSThomas Gleixner 	}
2598f159f1dSThomas Gleixner 
2608f159f1dSThomas Gleixner 	if (res) {
2618f159f1dSThomas Gleixner 		/* User code screwed up. */
2628f159f1dSThomas Gleixner 		regs->ax = -EFAULT;
2639caa7ff5SPeter Zijlstra 
2645d5675dfSAndy Lutomirski 		local_irq_disable();
265240001d4SPeter Zijlstra 		instrumentation_end();
2665d5675dfSAndy Lutomirski 		irqentry_exit_to_user_mode(regs);
2678f159f1dSThomas Gleixner 		return false;
2688f159f1dSThomas Gleixner 	}
2698f159f1dSThomas Gleixner 
2702978996fSH. Peter Anvin (Intel) 	nr = syscall_enter_from_user_mode_work(regs, nr);
2714facb95bSThomas Gleixner 
2728f159f1dSThomas Gleixner 	/* Now this is just like a normal syscall. */
2730b085e68SThomas Gleixner 	do_syscall_32_irqs_on(regs, nr);
2749caa7ff5SPeter Zijlstra 
2759caa7ff5SPeter Zijlstra 	instrumentation_end();
276167fd210SThomas Gleixner 	syscall_exit_to_user_mode(regs);
2778f159f1dSThomas Gleixner 	return true;
2788b13c255SAndy Lutomirski }
2798b13c255SAndy Lutomirski 
2805f310f73SAndy Lutomirski /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
do_fast_syscall_32(struct pt_regs * regs)2818f159f1dSThomas Gleixner __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
282710246dfSAndy Lutomirski {
283710246dfSAndy Lutomirski 	/*
284710246dfSAndy Lutomirski 	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
285710246dfSAndy Lutomirski 	 * convention.  Adjust regs so it looks like we entered using int80.
286710246dfSAndy Lutomirski 	 */
287710246dfSAndy Lutomirski 	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
288710246dfSAndy Lutomirski 					vdso_image_32.sym_int80_landing_pad;
289710246dfSAndy Lutomirski 
290710246dfSAndy Lutomirski 	/*
291710246dfSAndy Lutomirski 	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
292710246dfSAndy Lutomirski 	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
293710246dfSAndy Lutomirski 	 * Fix it up.
294710246dfSAndy Lutomirski 	 */
295710246dfSAndy Lutomirski 	regs->ip = landing_pad;
296710246dfSAndy Lutomirski 
2970b085e68SThomas Gleixner 	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
2980b085e68SThomas Gleixner 	if (!__do_fast_syscall_32(regs))
2998f159f1dSThomas Gleixner 		return 0;
3007841b408SAndy Lutomirski 
3017841b408SAndy Lutomirski #ifdef CONFIG_X86_64
3027841b408SAndy Lutomirski 	/*
3037841b408SAndy Lutomirski 	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
3047841b408SAndy Lutomirski 	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
3057841b408SAndy Lutomirski 	 * bother with SYSEXIT.
3067841b408SAndy Lutomirski 	 *
3077841b408SAndy Lutomirski 	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
3087841b408SAndy Lutomirski 	 * because the ECX fixup above will ensure that this is essentially
3097841b408SAndy Lutomirski 	 * never the case.
3107841b408SAndy Lutomirski 	 */
3117841b408SAndy Lutomirski 	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
3127841b408SAndy Lutomirski 		regs->ip == landing_pad &&
3137841b408SAndy Lutomirski 		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
3147841b408SAndy Lutomirski #else
3155f310f73SAndy Lutomirski 	/*
3165f310f73SAndy Lutomirski 	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
3175f310f73SAndy Lutomirski 	 *
3185f310f73SAndy Lutomirski 	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
3195f310f73SAndy Lutomirski 	 * because the ECX fixup above will ensure that this is essentially
3205f310f73SAndy Lutomirski 	 * never the case.
3215f310f73SAndy Lutomirski 	 *
3225f310f73SAndy Lutomirski 	 * We don't allow syscalls at all from VM86 mode, but we still
3235f310f73SAndy Lutomirski 	 * need to check VM, because we might be returning from sys_vm86.
3245f310f73SAndy Lutomirski 	 */
3255f310f73SAndy Lutomirski 	return static_cpu_has(X86_FEATURE_SEP) &&
3265f310f73SAndy Lutomirski 		regs->cs == __USER_CS && regs->ss == __USER_DS &&
3275f310f73SAndy Lutomirski 		regs->ip == landing_pad &&
3285f310f73SAndy Lutomirski 		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
3297841b408SAndy Lutomirski #endif
330710246dfSAndy Lutomirski }
331d1721250SAndy Lutomirski 
332d1721250SAndy Lutomirski /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
do_SYSENTER_32(struct pt_regs * regs)333d1721250SAndy Lutomirski __visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
334d1721250SAndy Lutomirski {
335d1721250SAndy Lutomirski 	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
336d1721250SAndy Lutomirski 	regs->sp = regs->bp;
337d1721250SAndy Lutomirski 
338d1721250SAndy Lutomirski 	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
339d1721250SAndy Lutomirski 	regs->flags |= X86_EFLAGS_IF;
340d1721250SAndy Lutomirski 
341d1721250SAndy Lutomirski 	return do_fast_syscall_32(regs);
342d1721250SAndy Lutomirski }
343bd2d3a3bSAndy Lutomirski #endif
344cc42c045SBrian Gerst 
SYSCALL_DEFINE0(ni_syscall)345cc42c045SBrian Gerst SYSCALL_DEFINE0(ni_syscall)
346cc42c045SBrian Gerst {
347cc42c045SBrian Gerst 	return -ENOSYS;
348cc42c045SBrian Gerst }
3490ba50e86SThomas Gleixner 
3502f6474e4SThomas Gleixner #ifdef CONFIG_XEN_PV
3512f6474e4SThomas Gleixner #ifndef CONFIG_PREEMPTION
/*
 * Hypercalls issued by the toolstack may run for many tens of seconds.
 * Tasks that execute hypercalls through the privcmd driver are therefore
 * allowed to be preempted voluntarily, even when full kernel preemption is
 * not enabled.
 *
 * A preemptible hypercall is marked by wrapping it in
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end(); this
 * per-CPU flag is what the upcall code below inspects.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
3642f6474e4SThomas Gleixner 
3652f6474e4SThomas Gleixner /*
3662f6474e4SThomas Gleixner  * In case of scheduling the flag must be cleared and restored after
3672f6474e4SThomas Gleixner  * returning from schedule as the task might move to a different CPU.
3682f6474e4SThomas Gleixner  */
get_and_clear_inhcall(void)3692f6474e4SThomas Gleixner static __always_inline bool get_and_clear_inhcall(void)
3702f6474e4SThomas Gleixner {
3712f6474e4SThomas Gleixner 	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
3722f6474e4SThomas Gleixner 
3732f6474e4SThomas Gleixner 	__this_cpu_write(xen_in_preemptible_hcall, false);
3742f6474e4SThomas Gleixner 	return inhcall;
3752f6474e4SThomas Gleixner }
3762f6474e4SThomas Gleixner 
restore_inhcall(bool inhcall)3772f6474e4SThomas Gleixner static __always_inline void restore_inhcall(bool inhcall)
3782f6474e4SThomas Gleixner {
3792f6474e4SThomas Gleixner 	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
3802f6474e4SThomas Gleixner }
3812f6474e4SThomas Gleixner #else
get_and_clear_inhcall(void)3822f6474e4SThomas Gleixner static __always_inline bool get_and_clear_inhcall(void) { return false; }
restore_inhcall(bool inhcall)3832f6474e4SThomas Gleixner static __always_inline void restore_inhcall(bool inhcall) { }
3842f6474e4SThomas Gleixner #endif
3852f6474e4SThomas Gleixner 
__xen_pv_evtchn_do_upcall(struct pt_regs * regs)386359f01d1SThomas Gleixner static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
3872f6474e4SThomas Gleixner {
388359f01d1SThomas Gleixner 	struct pt_regs *old_regs = set_irq_regs(regs);
389359f01d1SThomas Gleixner 
3902f6474e4SThomas Gleixner 	inc_irq_stat(irq_hv_callback_count);
3912f6474e4SThomas Gleixner 
39237510dd5SJuergen Gross 	xen_evtchn_do_upcall();
3932f6474e4SThomas Gleixner 
394359f01d1SThomas Gleixner 	set_irq_regs(old_regs);
3952f6474e4SThomas Gleixner }
3962f6474e4SThomas Gleixner 
xen_pv_evtchn_do_upcall(struct pt_regs * regs)3972f6474e4SThomas Gleixner __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
3982f6474e4SThomas Gleixner {
399359f01d1SThomas Gleixner 	irqentry_state_t state = irqentry_enter(regs);
400b037b09bSAndy Lutomirski 	bool inhcall;
4012f6474e4SThomas Gleixner 
40284e60065SPeter Zijlstra 	instrumentation_begin();
403359f01d1SThomas Gleixner 	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
4042f6474e4SThomas Gleixner 
4052f6474e4SThomas Gleixner 	inhcall = get_and_clear_inhcall();
406b037b09bSAndy Lutomirski 	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
407bdcd178aSThomas Gleixner 		irqentry_exit_cond_resched();
4082f6474e4SThomas Gleixner 		instrumentation_end();
4092f6474e4SThomas Gleixner 		restore_inhcall(inhcall);
4102f6474e4SThomas Gleixner 	} else {
41184e60065SPeter Zijlstra 		instrumentation_end();
412a27a0a55SThomas Gleixner 		irqentry_exit(regs, state);
4132f6474e4SThomas Gleixner 	}
4142f6474e4SThomas Gleixner }
4152f6474e4SThomas Gleixner #endif /* CONFIG_XEN_PV */
416