/*
 * Compatibility mode system call entry point for x86-64.
 *
 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
 *
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * All three entry points below build a complete struct pt_regs on the
 * kernel stack and pass a pointer to it (in %rdi) to a C handler.
 */
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <linux/linkage.h>
#include <linux/err.h>

	.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
/*
 * Native implementation of the "switch to user GS and SYSRET to 32-bit
 * user mode" sequence; only built for CONFIG_PARAVIRT kernels.
 */
ENTRY(native_usergs_sysret32)
	swapgs
	sysretl
ENDPROC(native_usergs_sysret32)
#endif

/*
 * 32-bit SYSENTER instruction entry.
 *
 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
 * IF and VM in rflags are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old rip (!!!) and rflags.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 *
 * NOTE(review): the list above does not match the code below, which
 * saves %rcx (not %rbp) as pt_regs->sp and marks the saved pt_regs->cx
 * as "will be overwritten".  Presumably the user-side trampoline passes
 * the user stack pointer in ecx and the C code reloads the real arg2 --
 * confirm against the vDSO and do_fast_syscall_32() before relying on
 * either description.
 *
 * This is purely a fast path.  For anything complicated we use the int 0x80
 * path below.  We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSENTER_compat)
	/* Interrupts are off on entry. */
	SWAPGS_UNSAFE_STACK
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/*
	 * User tracing code (ptrace or signal handlers) might assume that
	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
	 * syscall.  Just in case the high bits are nonzero, zero-extend
	 * the syscall number.  (This could almost certainly be deleted
	 * with no ill effects.)
	 */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%rcx			/* pt_regs->sp */

	/*
	 * Push flags.  This is nasty.  First, interrupts are currently
	 * off, but we need pt_regs->flags to have IF set.  Second, even
	 * if TF was set when SYSENTER started, it's clear by now.  We fix
	 * that later using TIF_SINGLESTEP.
	 */
	pushfq				/* pt_regs->flags (except IF = 0) */
	orl	$X86_EFLAGS_IF, (%rsp)	/* Fix saved flags */
	ASM_CLAC			/* Clear AC after saving FLAGS */

	pushq	$__USER32_CS		/* pt_regs->cs */
	xorq	%r8, %r8		/* r8 = 0: source for the zeroed slots */
	pushq	%r8			/* pt_regs->ip = 0 (placeholder) */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx (will be overwritten) */
	pushq	$-ENOSYS		/* pt_regs->ax */
	pushq	%r8			/* pt_regs->r8  = 0 */
	pushq	%r8			/* pt_regs->r9  = 0 */
	pushq	%r8			/* pt_regs->r10 = 0 */
	pushq	%r8			/* pt_regs->r11 = 0 */
	pushq	%rbx			/* pt_regs->rbx */
	pushq	%rbp			/* pt_regs->rbp */
	pushq	%r8			/* pt_regs->r12 = 0 */
	pushq	%r8			/* pt_regs->r13 = 0 */
	pushq	%r8			/* pt_regs->r14 = 0 */
	pushq	%r8			/* pt_regs->r15 = 0 */
	cld

	/*
	 * Sysenter doesn't filter flags, so we need to clear NT
	 * ourselves.  To save a few cycles, we can check whether
	 * NT was set instead of doing an unconditional popfq.
	 * This needs to happen before enabling interrupts so that
	 * we don't get preempted with NT set.
	 *
	 * NB.: sysenter_fix_flags is a label with the code under it moved
	 * out-of-line as an optimization: NT is unlikely to be set in the
	 * majority of the cases and instead of polluting the I$ unnecessarily,
	 * we're keeping that code behind a branch which will predict as
	 * not-taken and therefore its instructions won't be fetched.
	 */
	testl	$X86_EFLAGS_NT, EFLAGS(%rsp)
	jnz	sysenter_fix_flags
sysenter_flags_fixed:

	/*
	 * User mode is traced as though IRQs are on, and SYSENTER
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movq	%rsp, %rdi		/* arg1 = pointer to pt_regs */
	call	do_fast_syscall_32
	testl	%eax, %eax		/* zero: leave via the int 0x80 exit path */
	jz	.Lsyscall_32_done
	jmp	sysret32_from_system_call

sysenter_fix_flags:
	/* Out-of-line slow path: reload RFLAGS with only the fixed bit set. */
	pushq	$X86_EFLAGS_FIXED
	popfq
	jmp	sysenter_flags_fixed
ENDPROC(entry_SYSENTER_compat)

/*
 * 32-bit SYSCALL instruction entry.
 *
 * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed).  SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Note: rflags saving+masking-with-MSR happens only in Long mode
 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
 * Don't get confused: rflags saving+masking depends on Long Mode Active bit
 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
 *
 * Arguments:
 * eax  system call number
 * ecx  return address
 * ebx  arg1
 * ebp  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * esp  user stack
 * 0(%esp) arg6
 *
 * NOTE(review): %rbp is stored in pt_regs->rbp below, and the saved
 * pt_regs->cx (the return address) is marked "will be overwritten";
 * presumably the C code moves arg2 into place -- confirm against
 * do_fast_syscall_32().
 */
ENTRY(entry_SYSCALL_compat)
	/* Interrupts are off on entry. */
	SWAPGS_UNSAFE_STACK

	/* Stash user ESP and switch to the kernel stack. */
	movl	%esp, %r8d
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%r8			/* pt_regs->sp */
	pushq	%r11			/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%rcx			/* pt_regs->ip */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx (will be overwritten) */
	pushq	$-ENOSYS		/* pt_regs->ax */
	xorq	%r8, %r8		/* r8 = 0: source for the zeroed slots */
	pushq	%r8			/* pt_regs->r8  = 0 */
	pushq	%r8			/* pt_regs->r9  = 0 */
	pushq	%r8			/* pt_regs->r10 = 0 */
	pushq	%r8			/* pt_regs->r11 = 0 */
	pushq	%rbx			/* pt_regs->rbx */
	pushq	%rbp			/* pt_regs->rbp */
	pushq	%r8			/* pt_regs->r12 = 0 */
	pushq	%r8			/* pt_regs->r13 = 0 */
	pushq	%r8			/* pt_regs->r14 = 0 */
	pushq	%r8			/* pt_regs->r15 = 0 */

	/*
	 * User mode is traced as though IRQs are on, and SYSCALL
	 * turned them off.
	 */
	TRACE_IRQS_OFF

	movq	%rsp, %rdi		/* arg1 = pointer to pt_regs */
	call	do_fast_syscall_32
	testl	%eax, %eax		/* zero: leave via the int 0x80 exit path */
	jz	.Lsyscall_32_done

	/* Opportunistic SYSRET */
sysret32_from_system_call:
	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
	movq	RBX(%rsp), %rbx		/* pt_regs->rbx */
	movq	RBP(%rsp), %rbp		/* pt_regs->rbp */
	movq	EFLAGS(%rsp), %r11	/* pt_regs->flags (in r11) */
	movq	RIP(%rsp), %rcx		/* pt_regs->ip (in rcx) */
	addq	$RAX, %rsp		/* Skip r8-r15 */
	popq	%rax			/* pt_regs->rax */
	popq	%rdx			/* Skip pt_regs->cx */
	popq	%rdx			/* pt_regs->dx */
	popq	%rsi			/* pt_regs->si */
	popq	%rdi			/* pt_regs->di */

	/*
	 * USERGS_SYSRET32 does:
	 *  GSBASE = user's GS base
	 *  EIP = ECX
	 *  RFLAGS = R11
	 *  CS = __USER32_CS
	 *  SS = __USER_DS
	 *
	 * ECX will not match pt_regs->cx, but we're returning to a vDSO
	 * trampoline that will fix up RCX, so this is okay.
	 *
	 * R12-R15 are callee-saved, so they contain whatever was in them
	 * when the system call started, which is already known to user
	 * code.  We zero R8-R10 to avoid info leaks.
	 */
	xorq	%r8, %r8
	xorq	%r9, %r9
	xorq	%r10, %r10
	/* %rsp now points at pt_regs->orig_ax; load the saved user stack. */
	movq	RSP-ORIG_RAX(%rsp), %rsp
	USERGS_SYSRET32
END(entry_SYSCALL_compat)

/*
 * Emulated IA32 system calls via int 0x80.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6 (saved as pt_regs->rbp below)
 *
 * Notes:
 * Uses the same stack frame as the x86-64 version.
 * All registers except eax must be saved (but ptrace may violate that).
 * Arguments are zero extended. For system calls that want sign extension and
 * take long arguments a wrapper is needed. Most calls can just be called
 * directly.
 * Assumes it is only called from user space and entered with interrupts off.
 */

ENTRY(entry_INT80_compat)
	/*
	 * Interrupts are off on entry.
	 */
	PARAVIRT_ADJUST_EXCEPTION_FRAME

	/*
	 * Unlike SYSCALL, nothing on this path filters user RFLAGS, so AC
	 * may still be set from user mode.  Clear it before running any
	 * further kernel code, as the SYSENTER entry does, so that SMAP
	 * stays effective.  Do this early to minimize exposure.
	 */
	ASM_CLAC
	SWAPGS

	/*
	 * User tracing code (ptrace or signal handlers) might assume that
	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
	 * syscall.  Just in case the high bits are nonzero, zero-extend
	 * the syscall number.  (This could almost certainly be deleted
	 * with no ill effects.)
	 */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack (iret frame is already on stack) */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	xorq	%r8, %r8		/* r8 = 0: source for the zeroed slots */
	pushq	%r8			/* pt_regs->r8  = 0 */
	pushq	%r8			/* pt_regs->r9  = 0 */
	pushq	%r8			/* pt_regs->r10 = 0 */
	pushq	%r8			/* pt_regs->r11 = 0 */
	pushq	%rbx			/* pt_regs->rbx */
	pushq	%rbp			/* pt_regs->rbp */
	pushq	%r12			/* pt_regs->r12 */
	pushq	%r13			/* pt_regs->r13 */
	pushq	%r14			/* pt_regs->r14 */
	pushq	%r15			/* pt_regs->r15 */
	cld

	/*
	 * User mode is traced as though IRQs are on, and the interrupt
	 * gate turned them off.
	 */
	TRACE_IRQS_OFF

	movq	%rsp, %rdi		/* arg1 = pointer to pt_regs */
	call	do_syscall_32_irqs_off
.Lsyscall_32_done:
	/* Also the exit used by the fast entries when do_fast_syscall_32 returns 0. */

	/* Go back to user mode. */
	TRACE_IRQS_ON
	SWAPGS
	jmp	restore_regs_and_iret
END(entry_INT80_compat)

	ALIGN
GLOBAL(stub32_clone)
	/*
	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
	 *
	 * The native 64-bit kernel's sys_clone() implements the latter,
	 * so we need to swap arguments here before calling it:
	 */
	xchg	%r8, %rcx
	jmp	sys_clone