/*
 * Compatibility mode system call entry point for x86-64.
 *
 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
 *
 * GNU as, AT&T syntax, run through the C preprocessor.
 */
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <linux/linkage.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_LE		0x40000000

/*
 * Without syscall auditing the audit exit labels do not exist, so route
 * both of them to the common slow return path.
 */
#ifndef CONFIG_AUDITSYSCALL
# define sysexit_audit		ia32_ret_from_sys_call
# define sysretl_audit		ia32_ret_from_sys_call
#endif

	.section .entry.text, "ax"

#ifdef CONFIG_PARAVIRT
/* Native implementation of USERGS_SYSRET32: restore user GS, then SYSRETL. */
ENTRY(native_usergs_sysret32)
	swapgs
	sysretl
ENDPROC(native_usergs_sysret32)
#endif

/*
 * 32-bit SYSENTER instruction entry.
 *
 * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
 * IF and VM in rflags are cleared (IOW: interrupts are off).
 * SYSENTER does not save anything on the stack,
 * and does not save old rip (!!!) and rflags.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  user stack
 * 0(%ebp) arg6
 *
 * This is purely a fast path. For anything complicated we use the int 0x80
 * path below. We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSENTER_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%ebp, %ebp
	movl	%eax, %eax

	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%rbp			/* pt_regs->sp */
	pushfq				/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%r10			/* pt_regs->ip = thread_info->sysenter_return */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	cld
	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */

	/*
	 * no need to do an access_ok check here because rbp has been
	 * 32-bit zero extended
	 */
	ASM_STAC
1:	movl	(%rbp), %ebp		/* arg6 from the user stack */
	_ASM_EXTABLE(1b, ia32_badarg)
	ASM_CLAC

	/*
	 * Sysenter doesn't filter flags, so we need to clear NT
	 * ourselves. To save a few cycles, we can check whether
	 * NT was set instead of doing an unconditional popfq.
	 */
	testl	$X86_EFLAGS_NT, EFLAGS(%rsp)
	jnz	sysenter_fix_flags
sysenter_flags_fixed:

	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysenter_tracesys

sysenter_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */
sysenter_dispatch:
	/* Out-of-range syscall numbers skip the call; pt_regs->ax stays -ENOSYS */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f
	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysexit_audit
sysexit_from_sys_call:
	/*
	 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
	 * NMI between STI and SYSEXIT has poorly specified behavior,
	 * and an NMI followed by an IRQ with usergs is fatal. So
	 * we just pretend we're using SYSEXIT but we really use
	 * SYSRETL instead.
	 *
	 * This code path is still called 'sysexit' because it pairs
	 * with 'sysenter' and it uses the SYSENTER calling convention.
	 */
	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	movl	RIP(%rsp), %ecx		/* User %eip */
	RESTORE_RSI_RDI
	xorl	%edx, %edx		/* Do not leak kernel information */
	xorq	%r8, %r8
	xorq	%r9, %r9
	xorq	%r10, %r10
	movl	EFLAGS(%rsp), %r11d	/* User eflags */
	TRACE_IRQS_ON

	/*
	 * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
	 * since it avoids a dicey window with interrupts enabled.
	 */
	movl	RSP(%rsp), %esp

	/*
	 * USERGS_SYSRET32 does:
	 *  gsbase = user's gs base
	 *  eip = ecx
	 *  rflags = r11
	 *  cs = __USER32_CS
	 *  ss = __USER_DS
	 *
	 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
	 *
	 *  pop %ebp
	 *  pop %edx
	 *  pop %ecx
	 *
	 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
	 * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
	 * address (already known to user code), and R12-R15 are
	 * callee-saved and therefore don't contain any interesting
	 * kernel data.
	 */
	USERGS_SYSRET32

#ifdef CONFIG_AUDITSYSCALL
	.macro auditsys_entry_common
	/*
	 * At this point, registers hold syscall args in the 32-bit syscall ABI:
	 * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
	 *
	 * We want to pass them to __audit_syscall_entry(), which is a 64-bit
	 * C function with 5 parameters, so shuffle them to match what
	 * the function expects: RDI,RSI,RDX,RCX,R8.
	 */
	movl	%esi, %r8d		/* arg5 (R8 ) <= 4th syscall arg (ESI) */
	xchg	%ecx, %edx		/* arg4 (RCX) <= 3rd syscall arg (EDX) */
					/* arg3 (RDX) <= 2nd syscall arg (ECX) */
	movl	%ebx, %esi		/* arg2 (RSI) <= 1st syscall arg (EBX) */
	movl	%eax, %edi		/* arg1 (RDI) <= syscall number  (EAX) */
	call	__audit_syscall_entry

	/*
	 * We are going to jump back to the syscall dispatch code.
	 * Prepare syscall args as required by the 64-bit C ABI.
	 * Registers clobbered by __audit_syscall_entry() are
	 * loaded from pt_regs on stack:
	 */
	movl	ORIG_RAX(%rsp), %eax	/* syscall number */
	movl	%ebx, %edi		/* arg1 */
	movl	RCX(%rsp), %esi		/* arg2 */
	movl	RDX(%rsp), %edx		/* arg3 */
	movl	RSI(%rsp), %ecx		/* arg4 */
	movl	RDI(%rsp), %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	.endm

	/*
	 * auditsys_exit: report the syscall result to __audit_syscall_exit().
	 * \exit is the fast-return label to resume at when no other exit work
	 * is pending; otherwise control goes to the slow return paths.
	 */
	.macro auditsys_exit exit
	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_ret_from_sys_call
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	movl	%eax, %esi		/* second arg, syscall return value */
	cmpl	$-MAX_ERRNO, %eax	/* is it an error ? */
	jbe	1f			/* no: keep the zero-extended value */
	movslq	%eax, %rsi		/* if error sign extend to 64 bits */
	/* Flags are still those of the cmpl above: "be" means "no error" */
1:	setbe	%al			/* 1 if no error, 0 if error */
	movzbl	%al, %edi		/* zero-extend that into %edi (first arg) */
	call	__audit_syscall_exit
	movq	RAX(%rsp), %rax		/* reload syscall return value */
	movl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	/* Re-check: more work may have arrived while interrupts were on */
	testl	%edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	\exit
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	jmp	int_with_check
	.endm

sysenter_auditsys:
	auditsys_entry_common
	jmp	sysenter_dispatch

sysexit_audit:
	auditsys_exit sysexit_from_sys_call
#endif

sysenter_fix_flags:
	/* Load a clean rflags value (IF set, NT clear) and resume */
	pushq	$(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
	popfq
	jmp	sysenter_flags_fixed

sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
	/* Audit is the only entry work pending: take the cheaper audit path */
	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	sysenter_auditsys
#endif
	SAVE_EXTRA_REGS
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter

	/* Reload arg registers from stack. (see ia32_tracesys for why) */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */

	RESTORE_EXTRA_REGS
	jmp	sysenter_do_call
ENDPROC(entry_SYSENTER_compat)

/*
 * 32-bit SYSCALL instruction entry.
 *
 * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Note: rflags saving+masking-with-MSR happens only in Long mode
 * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
 * Don't get confused: rflags saving+masking depends on Long Mode Active bit
 * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
 * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
 *
 * Arguments:
 * eax  system call number
 * ecx  return address
 * ebx  arg1
 * ebp  arg2 (stored as pt_regs->cx; EBP itself is then reused to hold
 *      arg6 and is reloaded from pt_regs->cx after the syscall returns)
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * esp  user stack
 * 0(%esp) arg6
 *
 * This is purely a fast path. For anything complicated we use the int 0x80
 * path below. We set up a complete hardware stack frame to share code
 * with the int 0x80 path.
 */
ENTRY(entry_SYSCALL_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
	movl	%esp, %r8d		/* user stack pointer, zero-extended */
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack */
	pushq	$__USER32_DS		/* pt_regs->ss */
	pushq	%r8			/* pt_regs->sp */
	pushq	%r11			/* pt_regs->flags */
	pushq	$__USER32_CS		/* pt_regs->cs */
	pushq	%rcx			/* pt_regs->ip */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rbp			/* pt_regs->cx */
	movl	%ebp, %ecx		/* arg2 into ecx, as in the other entries */
	pushq	$-ENOSYS		/* pt_regs->ax */
	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */

	/*
	 * No need to do an access_ok check here because r8 has been
	 * 32-bit zero extended:
	 */
	ASM_STAC
1:	movl	(%r8), %ebp		/* arg6 from the user stack */
	_ASM_EXTABLE(1b, ia32_badarg)
	ASM_CLAC
	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	cstar_tracesys

cstar_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */

cstar_dispatch:
	/* Out-of-range syscall numbers skip the call; pt_regs->ax stays -ENOSYS */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f

	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	movl	RCX(%rsp), %ebp		/* EBP held arg6; put the user's value (arg2) back */
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	sysretl_audit

sysretl_from_sys_call:
	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	RESTORE_RSI_RDI_RDX
	movl	RIP(%rsp), %ecx
	movl	EFLAGS(%rsp), %r11d
	xorq	%r10, %r10
	xorq	%r9, %r9
	xorq	%r8, %r8
	TRACE_IRQS_ON
	movl	RSP(%rsp), %esp
	/*
	 * 64-bit->32-bit SYSRET restores eip from ecx,
	 * eflags from r11 (but RF and VM bits are forced to 0),
	 * cs and ss are loaded from MSRs.
	 * (Note: 32-bit->32-bit SYSRET is different: since r11
	 * does not exist, it merely sets eflags.IF=1).
	 *
	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
	 * descriptor is not reinitialized. This means that we must
	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
	 * exit the kernel, and re-enter using an interrupt vector. (All
	 * interrupt entries on x86_64 set SS to NULL.) We prevent that
	 * from happening by reloading SS in __switch_to.
	 */
	USERGS_SYSRET32

#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
	auditsys_entry_common
	jmp	cstar_dispatch

sysretl_audit:
	auditsys_exit sysretl_from_sys_call
#endif

cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
	/* Audit is the only entry work pending: take the cheaper audit path */
	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jz	cstar_auditsys
#endif
	/*
	 * NOTE(review): EBP holds arg6 at this point, so SAVE_EXTRA_REGS
	 * presumably records arg6 (not the user's EBP, i.e. arg2) as
	 * pt_regs->bp. The only reload of the user's EBP is the
	 * "movl RCX(%rsp), %ebp" after cstar_dispatch -- confirm that no
	 * return path for this entry restores EBP from pt_regs->bp instead.
	 */
	SAVE_EXTRA_REGS
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter

	/* Reload arg registers from stack. (see ia32_tracesys for why) */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */

	RESTORE_EXTRA_REGS
	jmp	cstar_do_call
END(entry_SYSCALL_compat)

/*
 * Exception-table target for a faulting arg6 load from the user stack:
 * fail the syscall with -EFAULT and leave through the slow return path.
 */
ia32_badarg:
	ASM_CLAC
	movq	$-EFAULT, RAX(%rsp)
ia32_ret_from_sys_call:
	xorl	%eax, %eax		/* Do not leak kernel information */
	movq	%rax, R11(%rsp)
	movq	%rax, R10(%rsp)
	movq	%rax, R9(%rsp)
	movq	%rax, R8(%rsp)
	jmp	int_ret_from_sys_call

/*
 * Emulated IA32 system calls via int 0x80.
 *
 * Arguments:
 * eax  system call number
 * ebx  arg1
 * ecx  arg2
 * edx  arg3
 * esi  arg4
 * edi  arg5
 * ebp  arg6 (note: not saved in the stack frame, should not be touched)
 *
 * Notes:
 * Uses the same stack frame as the x86-64 version.
 * All registers except eax must be saved (but ptrace may violate that).
 * Arguments are zero extended. For system calls that want sign extension and
 * take long arguments a wrapper is needed. Most calls can just be called
 * directly.
 * Assumes it is only called from user space and entered with interrupts off.
 */

ENTRY(entry_INT80_compat)
	/*
	 * Interrupts are off on entry.
	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
	 * it is too small to ever cause noticeable irq latency.
	 */
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	SWAPGS
	ENABLE_INTERRUPTS(CLBR_NONE)

	/* Zero-extending 32-bit regs, do not remove */
	movl	%eax, %eax

	/* Construct struct pt_regs on stack (iret frame is already on stack) */
	pushq	%rax			/* pt_regs->orig_ax */
	pushq	%rdi			/* pt_regs->di */
	pushq	%rsi			/* pt_regs->si */
	pushq	%rdx			/* pt_regs->dx */
	pushq	%rcx			/* pt_regs->cx */
	pushq	$-ENOSYS		/* pt_regs->ax */
	pushq	$0			/* pt_regs->r8 */
	pushq	$0			/* pt_regs->r9 */
	pushq	$0			/* pt_regs->r10 */
	pushq	$0			/* pt_regs->r11 */
	cld
	sub	$(6*8), %rsp		/* pt_regs->bp, bx, r12-15 not saved */

	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
	jnz	ia32_tracesys

ia32_do_call:
	/* 32-bit syscall -> 64-bit C ABI argument conversion */
	movl	%edi, %r8d		/* arg5 */
	movl	%ebp, %r9d		/* arg6 */
	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
	movl	%ebx, %edi		/* arg1 */
	movl	%edx, %edx		/* arg3 (zero extension) */
	/* Out-of-range syscall numbers skip the call; pt_regs->ax stays -ENOSYS */
	cmpq	$(IA32_NR_syscalls-1), %rax
	ja	1f

	call	*ia32_sys_call_table(, %rax, 8)
	movq	%rax, RAX(%rsp)
1:
	jmp	int_ret_from_sys_call

ia32_tracesys:
	SAVE_EXTRA_REGS
	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
	call	syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * Don't reload %eax because syscall_trace_enter() returned
	 * the %rax value we should see. But do truncate it to 32 bits.
	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
	 * an appropriately invalid value.
	 */
	movl	RCX(%rsp), %ecx
	movl	RDX(%rsp), %edx
	movl	RSI(%rsp), %esi
	movl	RDI(%rsp), %edi
	movl	%eax, %eax		/* zero extension */
	RESTORE_EXTRA_REGS
	jmp	ia32_do_call
END(entry_INT80_compat)

	/*
	 * PTREGSCALL label, func: emit global stub \label that loads the
	 * address of \func into %rax and tail-jumps to ia32_ptregs_common,
	 * which calls it with the extra registers saved.
	 */
	.macro PTREGSCALL label, func
	ALIGN
GLOBAL(\label)
	leaq	\func(%rip), %rax
	jmp	ia32_ptregs_common
	.endm

	PTREGSCALL stub32_rt_sigreturn,	sys32_rt_sigreturn
	PTREGSCALL stub32_sigreturn,	sys32_sigreturn
	PTREGSCALL stub32_fork,		sys_fork
	PTREGSCALL stub32_vfork,	sys_vfork

	ALIGN
GLOBAL(stub32_clone)
	leaq	sys_clone(%rip), %rax
	/*
	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
	 *
	 * The native 64-bit kernel's sys_clone() implements the latter,
	 * so we need to swap arguments here before calling it:
	 */
	xchg	%r8, %rcx
	jmp	ia32_ptregs_common

	/*
	 * Common tail of the stubs above.
	 * In:  %rax = address of the C handler to call.
	 * The "8" operand is taken to be the stack offset that accounts for
	 * the return address pushed by the dispatcher's call -- the macros
	 * are defined in calling.h, which is not visible here.
	 *
	 * NOTE(review): when reached from the SYSCALL32 entry, EBP holds
	 * arg6 rather than the user's EBP at this SAVE_EXTRA_REGS -- verify
	 * that handlers which copy or reload pt_regs (fork/clone/sigreturn)
	 * cope with that.
	 */
	ALIGN
ia32_ptregs_common:
	SAVE_EXTRA_REGS 8
	call	*%rax
	RESTORE_EXTRA_REGS 8
	ret
END(ia32_ptregs_common)