// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */
91f484aa6SAndy Lutomirski
101f484aa6SAndy Lutomirski #include <linux/kernel.h>
111f484aa6SAndy Lutomirski #include <linux/sched.h>
1268db0cf1SIngo Molnar #include <linux/sched/task_stack.h>
1327d6b4d1SThomas Gleixner #include <linux/entry-common.h>
141f484aa6SAndy Lutomirski #include <linux/mm.h>
151f484aa6SAndy Lutomirski #include <linux/smp.h>
161f484aa6SAndy Lutomirski #include <linux/errno.h>
171f484aa6SAndy Lutomirski #include <linux/ptrace.h>
181f484aa6SAndy Lutomirski #include <linux/export.h>
192fbd7af5SDan Williams #include <linux/nospec.h>
205ea0727bSThomas Garnier #include <linux/syscalls.h>
215f409e20SRik van Riel #include <linux/uaccess.h>
221f484aa6SAndy Lutomirski
232f6474e4SThomas Gleixner #ifdef CONFIG_XEN_PV
242f6474e4SThomas Gleixner #include <xen/xen-ops.h>
252f6474e4SThomas Gleixner #include <xen/events.h>
262f6474e4SThomas Gleixner #endif
272f6474e4SThomas Gleixner
2822ca647cSThomas Gleixner #include <asm/apic.h>
291f484aa6SAndy Lutomirski #include <asm/desc.h>
301f484aa6SAndy Lutomirski #include <asm/traps.h>
31710246dfSAndy Lutomirski #include <asm/vdso.h>
32cd4d09ecSBorislav Petkov #include <asm/cpufeature.h>
335f409e20SRik van Riel #include <asm/fpu/api.h>
3404dcbdb8SThomas Gleixner #include <asm/nospec-branch.h>
3522fe5b04SThomas Gleixner #include <asm/io_bitmap.h>
3699ce3255SBenjamin Thiel #include <asm/syscall.h>
372f6474e4SThomas Gleixner #include <asm/irq_stack.h>
381f484aa6SAndy Lutomirski
390b085e68SThomas Gleixner #ifdef CONFIG_X86_64
402978996fSH. Peter Anvin (Intel)
do_syscall_x64(struct pt_regs * regs,int nr)412978996fSH. Peter Anvin (Intel) static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
422978996fSH. Peter Anvin (Intel) {
432978996fSH. Peter Anvin (Intel) /*
442978996fSH. Peter Anvin (Intel) * Convert negative numbers to very high and thus out of range
452978996fSH. Peter Anvin (Intel) * numbers for comparisons.
462978996fSH. Peter Anvin (Intel) */
472978996fSH. Peter Anvin (Intel) unsigned int unr = nr;
482978996fSH. Peter Anvin (Intel)
492978996fSH. Peter Anvin (Intel) if (likely(unr < NR_syscalls)) {
502978996fSH. Peter Anvin (Intel) unr = array_index_nospec(unr, NR_syscalls);
51eb0f175bSLinus Torvalds regs->ax = x64_sys_call(regs, unr);
522978996fSH. Peter Anvin (Intel) return true;
532978996fSH. Peter Anvin (Intel) }
542978996fSH. Peter Anvin (Intel) return false;
552978996fSH. Peter Anvin (Intel) }
562978996fSH. Peter Anvin (Intel)
do_syscall_x32(struct pt_regs * regs,int nr)572978996fSH. Peter Anvin (Intel) static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
582978996fSH. Peter Anvin (Intel) {
592978996fSH. Peter Anvin (Intel) /*
602978996fSH. Peter Anvin (Intel) * Adjust the starting offset of the table, and convert numbers
612978996fSH. Peter Anvin (Intel) * < __X32_SYSCALL_BIT to very high and thus out of range
622978996fSH. Peter Anvin (Intel) * numbers for comparisons.
632978996fSH. Peter Anvin (Intel) */
642978996fSH. Peter Anvin (Intel) unsigned int xnr = nr - __X32_SYSCALL_BIT;
652978996fSH. Peter Anvin (Intel)
662978996fSH. Peter Anvin (Intel) if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
672978996fSH. Peter Anvin (Intel) xnr = array_index_nospec(xnr, X32_NR_syscalls);
68eb0f175bSLinus Torvalds regs->ax = x32_sys_call(regs, xnr);
692978996fSH. Peter Anvin (Intel) return true;
702978996fSH. Peter Anvin (Intel) }
712978996fSH. Peter Anvin (Intel) return false;
722978996fSH. Peter Anvin (Intel) }
732978996fSH. Peter Anvin (Intel)
do_syscall_64(struct pt_regs * regs,int nr)742978996fSH. Peter Anvin (Intel) __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
750b085e68SThomas Gleixner {
76fe950f60SKees Cook add_random_kstack_offset();
7727d6b4d1SThomas Gleixner nr = syscall_enter_from_user_mode(regs, nr);
780b085e68SThomas Gleixner
790b085e68SThomas Gleixner instrumentation_begin();
802978996fSH. Peter Anvin (Intel)
812978996fSH. Peter Anvin (Intel) if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
822978996fSH. Peter Anvin (Intel) /* Invalid system call, but still a system call. */
83b337b496SH. Peter Anvin (Intel) regs->ax = __x64_sys_ni_syscall(regs);
841e423bffSAndy Lutomirski }
852978996fSH. Peter Anvin (Intel)
868f159f1dSThomas Gleixner instrumentation_end();
87167fd210SThomas Gleixner syscall_exit_to_user_mode(regs);
881e423bffSAndy Lutomirski }
891e423bffSAndy Lutomirski #endif
901e423bffSAndy Lutomirski
91bd2d3a3bSAndy Lutomirski #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
syscall_32_enter(struct pt_regs * regs)922978996fSH. Peter Anvin (Intel) static __always_inline int syscall_32_enter(struct pt_regs *regs)
93bd2d3a3bSAndy Lutomirski {
940b085e68SThomas Gleixner if (IS_ENABLED(CONFIG_IA32_EMULATION))
950b085e68SThomas Gleixner current_thread_info()->status |= TS_COMPAT;
964facb95bSThomas Gleixner
972978996fSH. Peter Anvin (Intel) return (int)regs->orig_ax;
98bd2d3a3bSAndy Lutomirski }
99bd2d3a3bSAndy Lutomirski
#ifdef CONFIG_IA32_EMULATION
/*
 * Defaults to true and becomes read-only once init has finished
 * (__ro_after_init).
 * NOTE(review): presumably the global on/off state of 32-bit syscall
 * emulation; its readers and writers are not in this file - confirm.
 */
bool __ia32_enabled __ro_after_init = true;
#endif
103f259af26SNikolay Borisov
1040b085e68SThomas Gleixner /*
1050b085e68SThomas Gleixner * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
1060b085e68SThomas Gleixner */
do_syscall_32_irqs_on(struct pt_regs * regs,int nr)1072978996fSH. Peter Anvin (Intel) static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
1080b085e68SThomas Gleixner {
1092978996fSH. Peter Anvin (Intel) /*
1102978996fSH. Peter Anvin (Intel) * Convert negative numbers to very high and thus out of range
1112978996fSH. Peter Anvin (Intel) * numbers for comparisons.
1122978996fSH. Peter Anvin (Intel) */
1132978996fSH. Peter Anvin (Intel) unsigned int unr = nr;
1142978996fSH. Peter Anvin (Intel)
1152978996fSH. Peter Anvin (Intel) if (likely(unr < IA32_NR_syscalls)) {
1162978996fSH. Peter Anvin (Intel) unr = array_index_nospec(unr, IA32_NR_syscalls);
117eb0f175bSLinus Torvalds regs->ax = ia32_sys_call(regs, unr);
1182978996fSH. Peter Anvin (Intel) } else if (nr != -1) {
119b337b496SH. Peter Anvin (Intel) regs->ax = __ia32_sys_ni_syscall(regs);
120bd2d3a3bSAndy Lutomirski }
121bd2d3a3bSAndy Lutomirski }
122710246dfSAndy Lutomirski
1234591766fSThomas Gleixner #ifdef CONFIG_IA32_EMULATION
int80_is_external(void)12422ca647cSThomas Gleixner static __always_inline bool int80_is_external(void)
12522ca647cSThomas Gleixner {
12622ca647cSThomas Gleixner const unsigned int offs = (0x80 / 32) * 0x10;
12722ca647cSThomas Gleixner const u32 bit = BIT(0x80 % 32);
12822ca647cSThomas Gleixner
12922ca647cSThomas Gleixner /* The local APIC on XENPV guests is fake */
13022ca647cSThomas Gleixner if (cpu_feature_enabled(X86_FEATURE_XENPV))
13122ca647cSThomas Gleixner return false;
13222ca647cSThomas Gleixner
13322ca647cSThomas Gleixner /*
13422ca647cSThomas Gleixner * If vector 0x80 is set in the APIC ISR then this is an external
13522ca647cSThomas Gleixner * interrupt. Either from broken hardware or injected by a VMM.
13622ca647cSThomas Gleixner *
13722ca647cSThomas Gleixner * Note: In guest mode this is only valid for secure guests where
13822ca647cSThomas Gleixner * the secure module fully controls the vAPIC exposed to the guest.
13922ca647cSThomas Gleixner */
14022ca647cSThomas Gleixner return apic_read(APIC_ISR + offs) & bit;
14122ca647cSThomas Gleixner }
14222ca647cSThomas Gleixner
1434591766fSThomas Gleixner /**
144*eb36b0dcSPawan Gupta * do_int80_emulation - 32-bit legacy syscall C entry from asm
1454591766fSThomas Gleixner *
1464591766fSThomas Gleixner * This entry point can be used by 32-bit and 64-bit programs to perform
1474591766fSThomas Gleixner * 32-bit system calls. Instances of INT $0x80 can be found inline in
1484591766fSThomas Gleixner * various programs and libraries. It is also used by the vDSO's
1494591766fSThomas Gleixner * __kernel_vsyscall fallback for hardware that doesn't support a faster
1504591766fSThomas Gleixner * entry method. Restarted 32-bit system calls also fall back to INT
1514591766fSThomas Gleixner * $0x80 regardless of what instruction was originally used to do the
1524591766fSThomas Gleixner * system call.
1534591766fSThomas Gleixner *
1544591766fSThomas Gleixner * This is considered a slow path. It is not used by most libc
1554591766fSThomas Gleixner * implementations on modern hardware except during process startup.
1564591766fSThomas Gleixner *
1574591766fSThomas Gleixner * The arguments for the INT $0x80 based syscall are on stack in the
1584591766fSThomas Gleixner * pt_regs structure:
1594591766fSThomas Gleixner * eax: system call number
1604591766fSThomas Gleixner * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
1614591766fSThomas Gleixner */
do_int80_emulation(struct pt_regs * regs)162*eb36b0dcSPawan Gupta __visible noinstr void do_int80_emulation(struct pt_regs *regs)
1634591766fSThomas Gleixner {
1644591766fSThomas Gleixner int nr;
1654591766fSThomas Gleixner
16622ca647cSThomas Gleixner /* Kernel does not use INT $0x80! */
16722ca647cSThomas Gleixner if (unlikely(!user_mode(regs))) {
16822ca647cSThomas Gleixner irqentry_enter(regs);
16922ca647cSThomas Gleixner instrumentation_begin();
17022ca647cSThomas Gleixner panic("Unexpected external interrupt 0x80\n");
17122ca647cSThomas Gleixner }
17222ca647cSThomas Gleixner
17322ca647cSThomas Gleixner /*
17422ca647cSThomas Gleixner * Establish kernel context for instrumentation, including for
17522ca647cSThomas Gleixner * int80_is_external() below which calls into the APIC driver.
17622ca647cSThomas Gleixner * Identical for soft and external interrupts.
17722ca647cSThomas Gleixner */
1784591766fSThomas Gleixner enter_from_user_mode(regs);
1794591766fSThomas Gleixner
1804591766fSThomas Gleixner instrumentation_begin();
1814591766fSThomas Gleixner add_random_kstack_offset();
1824591766fSThomas Gleixner
18322ca647cSThomas Gleixner /* Validate that this is a soft interrupt to the extent possible */
18422ca647cSThomas Gleixner if (unlikely(int80_is_external()))
18522ca647cSThomas Gleixner panic("Unexpected external interrupt 0x80\n");
18622ca647cSThomas Gleixner
1874591766fSThomas Gleixner /*
1884591766fSThomas Gleixner * The low level idtentry code pushed -1 into regs::orig_ax
1894591766fSThomas Gleixner * and regs::ax contains the syscall number.
1904591766fSThomas Gleixner *
1914591766fSThomas Gleixner * User tracing code (ptrace or signal handlers) might assume
1924591766fSThomas Gleixner * that the regs::orig_ax contains a 32-bit number on invoking
1934591766fSThomas Gleixner * a 32-bit syscall.
1944591766fSThomas Gleixner *
1954591766fSThomas Gleixner * Establish the syscall convention by saving the 32bit truncated
1964591766fSThomas Gleixner * syscall number in regs::orig_ax and by invalidating regs::ax.
1974591766fSThomas Gleixner */
1984591766fSThomas Gleixner regs->orig_ax = regs->ax & GENMASK(31, 0);
1994591766fSThomas Gleixner regs->ax = -ENOSYS;
2004591766fSThomas Gleixner
2014591766fSThomas Gleixner nr = syscall_32_enter(regs);
2024591766fSThomas Gleixner
2034591766fSThomas Gleixner local_irq_enable();
2044591766fSThomas Gleixner nr = syscall_enter_from_user_mode_work(regs, nr);
2054591766fSThomas Gleixner do_syscall_32_irqs_on(regs, nr);
2064591766fSThomas Gleixner
2074591766fSThomas Gleixner instrumentation_end();
2084591766fSThomas Gleixner syscall_exit_to_user_mode(regs);
2094591766fSThomas Gleixner }
2104591766fSThomas Gleixner #else /* CONFIG_IA32_EMULATION */
2114591766fSThomas Gleixner
2124591766fSThomas Gleixner /* Handles int $0x80 on a 32bit kernel */
do_int80_syscall_32(struct pt_regs * regs)2138f159f1dSThomas Gleixner __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
2148b13c255SAndy Lutomirski {
2152978996fSH. Peter Anvin (Intel) int nr = syscall_32_enter(regs);
2168f159f1dSThomas Gleixner
217fe950f60SKees Cook add_random_kstack_offset();
2184facb95bSThomas Gleixner /*
2192978996fSH. Peter Anvin (Intel) * Subtlety here: if ptrace pokes something larger than 2^31-1 into
2202978996fSH. Peter Anvin (Intel) * orig_ax, the int return value truncates it. This matches
2212978996fSH. Peter Anvin (Intel) * the semantics of syscall_get_nr().
2224facb95bSThomas Gleixner */
2232978996fSH. Peter Anvin (Intel) nr = syscall_enter_from_user_mode(regs, nr);
2249caa7ff5SPeter Zijlstra instrumentation_begin();
2254facb95bSThomas Gleixner
2260b085e68SThomas Gleixner do_syscall_32_irqs_on(regs, nr);
2279caa7ff5SPeter Zijlstra
2289caa7ff5SPeter Zijlstra instrumentation_end();
229167fd210SThomas Gleixner syscall_exit_to_user_mode(regs);
2308f159f1dSThomas Gleixner }
2314591766fSThomas Gleixner #endif /* !CONFIG_IA32_EMULATION */
2328f159f1dSThomas Gleixner
__do_fast_syscall_32(struct pt_regs * regs)2330b085e68SThomas Gleixner static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
2348f159f1dSThomas Gleixner {
2352978996fSH. Peter Anvin (Intel) int nr = syscall_32_enter(regs);
2368f159f1dSThomas Gleixner int res;
2378f159f1dSThomas Gleixner
238fe950f60SKees Cook add_random_kstack_offset();
2394facb95bSThomas Gleixner /*
2404facb95bSThomas Gleixner * This cannot use syscall_enter_from_user_mode() as it has to
2414facb95bSThomas Gleixner * fetch EBP before invoking any of the syscall entry work
2424facb95bSThomas Gleixner * functions.
2434facb95bSThomas Gleixner */
2444facb95bSThomas Gleixner syscall_enter_from_user_mode_prepare(regs);
2454facb95bSThomas Gleixner
2460b085e68SThomas Gleixner instrumentation_begin();
2478f159f1dSThomas Gleixner /* Fetch EBP from where the vDSO stashed it. */
2488f159f1dSThomas Gleixner if (IS_ENABLED(CONFIG_X86_64)) {
2498f159f1dSThomas Gleixner /*
2508f159f1dSThomas Gleixner * Micro-optimization: the pointer we're following is
2518f159f1dSThomas Gleixner * explicitly 32 bits, so it can't be out of range.
2528f159f1dSThomas Gleixner */
2538f159f1dSThomas Gleixner res = __get_user(*(u32 *)®s->bp,
2548f159f1dSThomas Gleixner (u32 __user __force *)(unsigned long)(u32)regs->sp);
2558f159f1dSThomas Gleixner } else {
2568f159f1dSThomas Gleixner res = get_user(*(u32 *)®s->bp,
2578f159f1dSThomas Gleixner (u32 __user __force *)(unsigned long)(u32)regs->sp);
2588f159f1dSThomas Gleixner }
2598f159f1dSThomas Gleixner
2608f159f1dSThomas Gleixner if (res) {
2618f159f1dSThomas Gleixner /* User code screwed up. */
2628f159f1dSThomas Gleixner regs->ax = -EFAULT;
2639caa7ff5SPeter Zijlstra
2645d5675dfSAndy Lutomirski local_irq_disable();
265240001d4SPeter Zijlstra instrumentation_end();
2665d5675dfSAndy Lutomirski irqentry_exit_to_user_mode(regs);
2678f159f1dSThomas Gleixner return false;
2688f159f1dSThomas Gleixner }
2698f159f1dSThomas Gleixner
2702978996fSH. Peter Anvin (Intel) nr = syscall_enter_from_user_mode_work(regs, nr);
2714facb95bSThomas Gleixner
2728f159f1dSThomas Gleixner /* Now this is just like a normal syscall. */
2730b085e68SThomas Gleixner do_syscall_32_irqs_on(regs, nr);
2749caa7ff5SPeter Zijlstra
2759caa7ff5SPeter Zijlstra instrumentation_end();
276167fd210SThomas Gleixner syscall_exit_to_user_mode(regs);
2778f159f1dSThomas Gleixner return true;
2788b13c255SAndy Lutomirski }
2798b13c255SAndy Lutomirski
2805f310f73SAndy Lutomirski /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
do_fast_syscall_32(struct pt_regs * regs)2818f159f1dSThomas Gleixner __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
282710246dfSAndy Lutomirski {
283710246dfSAndy Lutomirski /*
284710246dfSAndy Lutomirski * Called using the internal vDSO SYSENTER/SYSCALL32 calling
285710246dfSAndy Lutomirski * convention. Adjust regs so it looks like we entered using int80.
286710246dfSAndy Lutomirski */
287710246dfSAndy Lutomirski unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
288710246dfSAndy Lutomirski vdso_image_32.sym_int80_landing_pad;
289710246dfSAndy Lutomirski
290710246dfSAndy Lutomirski /*
291710246dfSAndy Lutomirski * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
292710246dfSAndy Lutomirski * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
293710246dfSAndy Lutomirski * Fix it up.
294710246dfSAndy Lutomirski */
295710246dfSAndy Lutomirski regs->ip = landing_pad;
296710246dfSAndy Lutomirski
2970b085e68SThomas Gleixner /* Invoke the syscall. If it failed, keep it simple: use IRET. */
2980b085e68SThomas Gleixner if (!__do_fast_syscall_32(regs))
2998f159f1dSThomas Gleixner return 0;
3007841b408SAndy Lutomirski
3017841b408SAndy Lutomirski #ifdef CONFIG_X86_64
3027841b408SAndy Lutomirski /*
3037841b408SAndy Lutomirski * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
3047841b408SAndy Lutomirski * SYSRETL is available on all 64-bit CPUs, so we don't need to
3057841b408SAndy Lutomirski * bother with SYSEXIT.
3067841b408SAndy Lutomirski *
3077841b408SAndy Lutomirski * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
3087841b408SAndy Lutomirski * because the ECX fixup above will ensure that this is essentially
3097841b408SAndy Lutomirski * never the case.
3107841b408SAndy Lutomirski */
3117841b408SAndy Lutomirski return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
3127841b408SAndy Lutomirski regs->ip == landing_pad &&
3137841b408SAndy Lutomirski (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
3147841b408SAndy Lutomirski #else
3155f310f73SAndy Lutomirski /*
3165f310f73SAndy Lutomirski * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
3175f310f73SAndy Lutomirski *
3185f310f73SAndy Lutomirski * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
3195f310f73SAndy Lutomirski * because the ECX fixup above will ensure that this is essentially
3205f310f73SAndy Lutomirski * never the case.
3215f310f73SAndy Lutomirski *
3225f310f73SAndy Lutomirski * We don't allow syscalls at all from VM86 mode, but we still
3235f310f73SAndy Lutomirski * need to check VM, because we might be returning from sys_vm86.
3245f310f73SAndy Lutomirski */
3255f310f73SAndy Lutomirski return static_cpu_has(X86_FEATURE_SEP) &&
3265f310f73SAndy Lutomirski regs->cs == __USER_CS && regs->ss == __USER_DS &&
3275f310f73SAndy Lutomirski regs->ip == landing_pad &&
3285f310f73SAndy Lutomirski (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
3297841b408SAndy Lutomirski #endif
330710246dfSAndy Lutomirski }
331d1721250SAndy Lutomirski
332d1721250SAndy Lutomirski /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
do_SYSENTER_32(struct pt_regs * regs)333d1721250SAndy Lutomirski __visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
334d1721250SAndy Lutomirski {
335d1721250SAndy Lutomirski /* SYSENTER loses RSP, but the vDSO saved it in RBP. */
336d1721250SAndy Lutomirski regs->sp = regs->bp;
337d1721250SAndy Lutomirski
338d1721250SAndy Lutomirski /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
339d1721250SAndy Lutomirski regs->flags |= X86_EFLAGS_IF;
340d1721250SAndy Lutomirski
341d1721250SAndy Lutomirski return do_fast_syscall_32(regs);
342d1721250SAndy Lutomirski }
343bd2d3a3bSAndy Lutomirski #endif
344cc42c045SBrian Gerst
SYSCALL_DEFINE0(ni_syscall)345cc42c045SBrian Gerst SYSCALL_DEFINE0(ni_syscall)
346cc42c045SBrian Gerst {
347cc42c045SBrian Gerst return -ENOSYS;
348cc42c045SBrian Gerst }
3490ba50e86SThomas Gleixner
3502f6474e4SThomas Gleixner #ifdef CONFIG_XEN_PV
3512f6474e4SThomas Gleixner #ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 *
 * The per-CPU flag below records whether the current CPU is inside such
 * a bracketed hypercall.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
3642f6474e4SThomas Gleixner
3652f6474e4SThomas Gleixner /*
3662f6474e4SThomas Gleixner * In case of scheduling the flag must be cleared and restored after
3672f6474e4SThomas Gleixner * returning from schedule as the task might move to a different CPU.
3682f6474e4SThomas Gleixner */
get_and_clear_inhcall(void)3692f6474e4SThomas Gleixner static __always_inline bool get_and_clear_inhcall(void)
3702f6474e4SThomas Gleixner {
3712f6474e4SThomas Gleixner bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
3722f6474e4SThomas Gleixner
3732f6474e4SThomas Gleixner __this_cpu_write(xen_in_preemptible_hcall, false);
3742f6474e4SThomas Gleixner return inhcall;
3752f6474e4SThomas Gleixner }
3762f6474e4SThomas Gleixner
restore_inhcall(bool inhcall)3772f6474e4SThomas Gleixner static __always_inline void restore_inhcall(bool inhcall)
3782f6474e4SThomas Gleixner {
3792f6474e4SThomas Gleixner __this_cpu_write(xen_in_preemptible_hcall, inhcall);
3802f6474e4SThomas Gleixner }
3812f6474e4SThomas Gleixner #else
get_and_clear_inhcall(void)3822f6474e4SThomas Gleixner static __always_inline bool get_and_clear_inhcall(void) { return false; }
restore_inhcall(bool inhcall)3832f6474e4SThomas Gleixner static __always_inline void restore_inhcall(bool inhcall) { }
3842f6474e4SThomas Gleixner #endif
3852f6474e4SThomas Gleixner
__xen_pv_evtchn_do_upcall(struct pt_regs * regs)386359f01d1SThomas Gleixner static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
3872f6474e4SThomas Gleixner {
388359f01d1SThomas Gleixner struct pt_regs *old_regs = set_irq_regs(regs);
389359f01d1SThomas Gleixner
3902f6474e4SThomas Gleixner inc_irq_stat(irq_hv_callback_count);
3912f6474e4SThomas Gleixner
39237510dd5SJuergen Gross xen_evtchn_do_upcall();
3932f6474e4SThomas Gleixner
394359f01d1SThomas Gleixner set_irq_regs(old_regs);
3952f6474e4SThomas Gleixner }
3962f6474e4SThomas Gleixner
xen_pv_evtchn_do_upcall(struct pt_regs * regs)3972f6474e4SThomas Gleixner __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
3982f6474e4SThomas Gleixner {
399359f01d1SThomas Gleixner irqentry_state_t state = irqentry_enter(regs);
400b037b09bSAndy Lutomirski bool inhcall;
4012f6474e4SThomas Gleixner
40284e60065SPeter Zijlstra instrumentation_begin();
403359f01d1SThomas Gleixner run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
4042f6474e4SThomas Gleixner
4052f6474e4SThomas Gleixner inhcall = get_and_clear_inhcall();
406b037b09bSAndy Lutomirski if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
407bdcd178aSThomas Gleixner irqentry_exit_cond_resched();
4082f6474e4SThomas Gleixner instrumentation_end();
4092f6474e4SThomas Gleixner restore_inhcall(inhcall);
4102f6474e4SThomas Gleixner } else {
41184e60065SPeter Zijlstra instrumentation_end();
412a27a0a55SThomas Gleixner irqentry_exit(regs, state);
4132f6474e4SThomas Gleixner }
4142f6474e4SThomas Gleixner }
4152f6474e4SThomas Gleixner #endif /* CONFIG_XEN_PV */
416