// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/entry-common.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/export.h>
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();
	if (likely(nr < NR_syscalls)) {
		nr = array_index_nospec(nr, NR_syscalls);
		regs->ax = sys_call_table[nr](regs);
#ifdef CONFIG_X86_X32_ABI
	} else if (likely((nr & __X32_SYSCALL_BIT) &&
			  (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
		nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
					X32_NR_syscalls);
		regs->ax = x32_sys_call_table[nr](regs);
#endif
	}
	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif
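
/*
 * Illustrative sketch, not part of this file: how the x32 dispatch above
 * looks from userspace.  An x32 call is selected by setting
 * __X32_SYSCALL_BIT (0x40000000) in the syscall number; do_syscall_64()
 * masks the bit back off before indexing x32_sys_call_table[].  The
 * concrete number below assumes the current syscall_64.tbl layout, where
 * x32 getpid shares slot 39 with the 64-bit ABI:
 *
 *	#include <unistd.h>		// hypothetical userspace caller
 *	long pid = syscall(0x40000000 + 39);	// __X32_SYSCALL_BIT | getpid
 */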

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_IA32_EMULATION))
		current_thread_info()->status |= TS_COMPAT;

	return (unsigned int)regs->orig_ax;
}

/*
 * Invoke a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
						  unsigned int nr)
{
	if (likely(nr < IA32_NR_syscalls)) {
		nr = array_index_nospec(nr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[nr](regs);
	}
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	unsigned int nr = syscall_32_enter(regs);

	add_random_kstack_offset();
	/*
	 * Subtlety here: if ptrace pokes something larger than 2^32-1 into
	 * orig_ax, the unsigned int return value truncates it.  This may
	 * or may not be necessary, but it matches the old asm behavior.
	 */
	nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}

static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	unsigned int nr = syscall_32_enter(regs);
	int res;

	add_random_kstack_offset();
	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

		instrumentation_end();
		local_irq_disable();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	/* The cast truncates any ptrace induced syscall nr > 2^32 -1 */
	nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
	if (!__do_fast_syscall_32(regs))
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	return do_fast_syscall_32(regs);
}
#endif
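
/*
 * Illustrative sketch, not part of this file: the 32-bit vDSO's
 * __kernel_vsyscall entry that feeds the fast path above is roughly the
 * following (the authoritative version lives in
 * arch/x86/entry/vdso/vdso32/system_call.S):
 *
 *	pushl	%ecx
 *	pushl	%edx
 *	pushl	%ebp
 *	movl	%esp, %ebp	// stash ESP; do_SYSENTER_32() restores it
 *	sysenter		// or "movl %ecx, %ebp; syscall" on AMD
 *	int	$0x80		// int80_landing_pad: restart/fallback point
 *	popl	%ebp		// reload the real EBP pushed above
 *	popl	%edx
 *	popl	%ecx
 *	ret
 *
 * This is why __do_fast_syscall_32() fetches the sixth syscall argument
 * (EBP) from the user stack: the entry sequences clobber the EBP register
 * itself before entering the kernel.
 */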

SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many tens of
 * seconds.  Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * If the hypercall ends up scheduling, the flag must be cleared before
 * calling schedule() and restored after returning from it, as the task
 * might resume on a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	inc_irq_stat(irq_hv_callback_count);

	xen_hvm_evtchn_do_upcall();

	set_irq_regs(old_regs);
}

__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	irqentry_state_t state = irqentry_enter(regs);
	bool inhcall;

	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
		instrumentation_begin();
		irqentry_exit_cond_resched();
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		irqentry_exit(regs, state);
	}
}
#endif /* CONFIG_XEN_PV */
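
/*
 * Illustrative sketch, not part of this file: how a preemptible hypercall
 * is expected to be bracketed so that xen_pv_evtchn_do_upcall() above can
 * voluntarily reschedule in the middle of it.  Hypothetical call site,
 * assuming the xen_preemptible_hcall_begin()/end() helpers declared in
 * <xen/xen-ops.h>:
 *
 *	xen_preemptible_hcall_begin();
 *	ret = privcmd_call(hcall, a1, a2, a3, a4, a5);	// may run for tens of seconds
 *	xen_preemptible_hcall_end();
 *
 * While the per-CPU flag is set, an event-channel upcall that interrupts
 * kernel mode may call irqentry_exit_cond_resched() even on
 * !CONFIG_PREEMPTION kernels.
 */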