// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors. The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/entry-common.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/export.h>
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#ifdef CONFIG_X86_64

static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = sys_call_table[unr](regs);
		return true;
	}
	return false;
}

static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call_table[xnr](regs);
		return true;
	}
	return false;
}

__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_IA32_EMULATION))
		current_thread_info()->status |= TS_COMPAT;

	return (int)regs->orig_ax;
}

/*
 * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < IA32_NR_syscalls)) {
		unr = array_index_nospec(unr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[unr](regs);
	} else if (nr != -1) {
		regs->ax = __ia32_sys_ni_syscall(regs);
	}
}

/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);

	add_random_kstack_offset();
	/*
	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
	 * orig_ax, the int return value truncates it. This matches
	 * the semantics of syscall_get_nr().
	 */
	nr = syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
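/*
 * Illustration (not part of this file upstream): a 32-bit process reaches
 * do_int80_syscall_32() above with the syscall number in EAX and arguments
 * in EBX, ECX, EDX, ESI, EDI and EBP, e.g. for write(1, msg, len):
 *
 *	movl	$4, %eax	// __NR_write on 32-bit x86
 *	movl	$1, %ebx	// fd
 *	movl	$msg, %ecx	// buf
 *	movl	$len, %edx	// count
 *	int	$0x80
 *
 * The entry asm saves EAX as regs->orig_ax, which syscall_32_enter()
 * truncates back to int.
 */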
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);
	int res;

	add_random_kstack_offset();
	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

		local_irq_disable();
		instrumentation_end();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	nr = syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention. Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
	if (!__do_fast_syscall_32(regs))
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}
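/*
 * For reference (paraphrased from the 32-bit vDSO; not built here): the
 * __kernel_vsyscall entry sequence pushes the registers that SYSENTER and
 * SYSCALL32 clobber, leaving the original EBP (arg6) on top of the user
 * stack, roughly:
 *
 *	pushl	%ecx
 *	pushl	%edx
 *	pushl	%ebp
 *	movl	%esp, %ebp	// SYSENTER case: hardware destroys ESP
 *	sysenter
 *
 * That saved EBP at 0(%esp) is what __do_fast_syscall_32() re-fetches
 * into regs->bp above, and the ESP copy stashed in EBP is what
 * do_SYSENTER_32() below restores into regs->sp.
 */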
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;

	return do_fast_syscall_32(regs);
}
#endif

SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * In case of scheduling the flag must be cleared and restored after
 * returning from schedule as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	inc_irq_stat(irq_hv_callback_count);

	xen_hvm_evtchn_do_upcall();

	set_irq_regs(old_regs);
}

__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	irqentry_state_t state = irqentry_enter(regs);
	bool inhcall;

	instrumentation_begin();
	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
		irqentry_exit_cond_resched();
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		instrumentation_end();
		irqentry_exit(regs, state);
	}
}
#endif /* CONFIG_XEN_PV */
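/*
 * Usage reference (a sketch mirroring drivers/xen/privcmd.c; not part of
 * this file upstream): a long-running toolstack hypercall brackets itself
 * with the helpers from <xen/xen-ops.h> so that xen_pv_evtchn_do_upcall()
 * above may voluntarily reschedule even on !CONFIG_PREEMPTION kernels:
 *
 *	xen_preemptible_hcall_begin();
 *	ret = privcmd_call(hypercall.op,
 *			   hypercall.arg[0], hypercall.arg[1],
 *			   hypercall.arg[2], hypercall.arg[3],
 *			   hypercall.arg[4]);
 *	xen_preemptible_hcall_end();
 */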