1 /* 2 * Copyright (C) 1995 Linus Torvalds 3 * 4 * Pentium III FXSR, SSE support 5 * Gareth Hughes <gareth@valinux.com>, May 2000 6 * 7 * X86-64 port 8 * Andi Kleen. 9 * 10 * CPU hotplug support - ashok.raj@intel.com 11 */ 12 13 /* 14 * This file handles the architecture-dependent parts of process handling.. 15 */ 16 17 #include <linux/cpu.h> 18 #include <linux/errno.h> 19 #include <linux/sched.h> 20 #include <linux/sched/task.h> 21 #include <linux/sched/task_stack.h> 22 #include <linux/fs.h> 23 #include <linux/kernel.h> 24 #include <linux/mm.h> 25 #include <linux/elfcore.h> 26 #include <linux/smp.h> 27 #include <linux/slab.h> 28 #include <linux/user.h> 29 #include <linux/interrupt.h> 30 #include <linux/delay.h> 31 #include <linux/export.h> 32 #include <linux/ptrace.h> 33 #include <linux/notifier.h> 34 #include <linux/kprobes.h> 35 #include <linux/kdebug.h> 36 #include <linux/prctl.h> 37 #include <linux/uaccess.h> 38 #include <linux/io.h> 39 #include <linux/ftrace.h> 40 #include <linux/syscalls.h> 41 42 #include <asm/pgtable.h> 43 #include <asm/processor.h> 44 #include <asm/fpu/internal.h> 45 #include <asm/mmu_context.h> 46 #include <asm/prctl.h> 47 #include <asm/desc.h> 48 #include <asm/proto.h> 49 #include <asm/ia32.h> 50 #include <asm/syscalls.h> 51 #include <asm/debugreg.h> 52 #include <asm/switch_to.h> 53 #include <asm/xen/hypervisor.h> 54 #include <asm/vdso.h> 55 #include <asm/intel_rdt_sched.h> 56 #include <asm/unistd.h> 57 #include <asm/fsgsbase.h> 58 #ifdef CONFIG_IA32_EMULATION 59 /* Not included via unistd.h */ 60 #include <asm/unistd_32_ia32.h> 61 #endif 62 63 /* Prints also some state that isn't saved in the pt_regs */ 64 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) 65 { 66 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 67 unsigned long d0, d1, d2, d3, d6, d7; 68 unsigned int fsindex, gsindex; 69 unsigned int ds, cs, es; 70 71 show_iret_regs(regs); 72 73 if (regs->orig_ax != -1) 74 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); 75 else 76 pr_cont("\n"); 77 78 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", 79 regs->ax, regs->bx, regs->cx); 80 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", 81 regs->dx, regs->si, regs->di); 82 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", 83 regs->bp, regs->r8, regs->r9); 84 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", 85 regs->r10, regs->r11, regs->r12); 86 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", 87 regs->r13, regs->r14, regs->r15); 88 89 if (mode == SHOW_REGS_SHORT) 90 return; 91 92 if (mode == SHOW_REGS_USER) { 93 rdmsrl(MSR_FS_BASE, fs); 94 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 95 printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n", 96 fs, shadowgs); 97 return; 98 } 99 100 asm("movl %%ds,%0" : "=r" (ds)); 101 asm("movl %%cs,%0" : "=r" (cs)); 102 asm("movl %%es,%0" : "=r" (es)); 103 asm("movl %%fs,%0" : "=r" (fsindex)); 104 asm("movl %%gs,%0" : "=r" (gsindex)); 105 106 rdmsrl(MSR_FS_BASE, fs); 107 rdmsrl(MSR_GS_BASE, gs); 108 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 109 110 cr0 = read_cr0(); 111 cr2 = read_cr2(); 112 cr3 = __read_cr3(); 113 cr4 = __read_cr4(); 114 115 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 116 fs, fsindex, gs, gsindex, shadowgs); 117 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 118 es, cr0); 119 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 120 cr4); 121 122 get_debugreg(d0, 0); 123 get_debugreg(d1, 1); 124 get_debugreg(d2, 2); 125 get_debugreg(d3, 3); 126 get_debugreg(d6, 6); 127 get_debugreg(d7, 7); 128 129 /* Only print out debug registers if they are in their non-default state. */ 130 if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && 131 (d6 == DR6_RESERVED) && (d7 == 0x400))) { 132 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", 133 d0, d1, d2); 134 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", 135 d3, d6, d7); 136 } 137 138 if (boot_cpu_has(X86_FEATURE_OSPKE)) 139 printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); 140 } 141 142 void release_thread(struct task_struct *dead_task) 143 { 144 if (dead_task->mm) { 145 #ifdef CONFIG_MODIFY_LDT_SYSCALL 146 if (dead_task->mm->context.ldt) { 147 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", 148 dead_task->comm, 149 dead_task->mm->context.ldt->entries, 150 dead_task->mm->context.ldt->nr_entries); 151 BUG(); 152 } 153 #endif 154 } 155 } 156 157 enum which_selector { 158 FS, 159 GS 160 }; 161 162 /* 163 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are 164 * not available. The goal is to be reasonably fast on non-FSGSBASE systems. 165 * It's forcibly inlined because it'll generate better code and this function 166 * is hot. 167 */ 168 static __always_inline void save_base_legacy(struct task_struct *prev_p, 169 unsigned short selector, 170 enum which_selector which) 171 { 172 if (likely(selector == 0)) { 173 /* 174 * On Intel (without X86_BUG_NULL_SEG), the segment base could 175 * be the pre-existing saved base or it could be zero. On AMD 176 * (with X86_BUG_NULL_SEG), the segment base could be almost 177 * anything. 178 * 179 * This branch is very hot (it's hit twice on almost every 180 * context switch between 64-bit programs), and avoiding 181 * the RDMSR helps a lot, so we just assume that whatever 182 * value is already saved is correct. This matches historical 183 * Linux behavior, so it won't break existing applications. 184 * 185 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we 186 * report that the base is zero, it needs to actually be zero: 187 * see the corresponding logic in load_seg_legacy. 188 */ 189 } else { 190 /* 191 * If the selector is 1, 2, or 3, then the base is zero on 192 * !X86_BUG_NULL_SEG CPUs and could be anything on 193 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux 194 * has never attempted to preserve the base across context 195 * switches. 196 * 197 * If selector > 3, then it refers to a real segment, and 198 * saving the base isn't necessary. 199 */ 200 if (which == FS) 201 prev_p->thread.fsbase = 0; 202 else 203 prev_p->thread.gsbase = 0; 204 } 205 } 206 207 static __always_inline void save_fsgs(struct task_struct *task) 208 { 209 savesegment(fs, task->thread.fsindex); 210 savesegment(gs, task->thread.gsindex); 211 save_base_legacy(task, task->thread.fsindex, FS); 212 save_base_legacy(task, task->thread.gsindex, GS); 213 } 214 215 #if IS_ENABLED(CONFIG_KVM) 216 /* 217 * While a process is running,current->thread.fsbase and current->thread.gsbase 218 * may not match the corresponding CPU registers (see save_base_legacy()). KVM 219 * wants an efficient way to save and restore FSBASE and GSBASE. 220 * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. 221 */ 222 void save_fsgs_for_kvm(void) 223 { 224 save_fsgs(current); 225 } 226 EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); 227 #endif 228 229 static __always_inline void loadseg(enum which_selector which, 230 unsigned short sel) 231 { 232 if (which == FS) 233 loadsegment(fs, sel); 234 else 235 load_gs_index(sel); 236 } 237 238 static __always_inline void load_seg_legacy(unsigned short prev_index, 239 unsigned long prev_base, 240 unsigned short next_index, 241 unsigned long next_base, 242 enum which_selector which) 243 { 244 if (likely(next_index <= 3)) { 245 /* 246 * The next task is using 64-bit TLS, is not using this 247 * segment at all, or is having fun with arcane CPU features. 248 */ 249 if (next_base == 0) { 250 /* 251 * Nasty case: on AMD CPUs, we need to forcibly zero 252 * the base. 253 */ 254 if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { 255 loadseg(which, __USER_DS); 256 loadseg(which, next_index); 257 } else { 258 /* 259 * We could try to exhaustively detect cases 260 * under which we can skip the segment load, 261 * but there's really only one case that matters 262 * for performance: if both the previous and 263 * next states are fully zeroed, we can skip 264 * the load. 265 * 266 * (This assumes that prev_base == 0 has no 267 * false positives. This is the case on 268 * Intel-style CPUs.) 269 */ 270 if (likely(prev_index | next_index | prev_base)) 271 loadseg(which, next_index); 272 } 273 } else { 274 if (prev_index != next_index) 275 loadseg(which, next_index); 276 wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, 277 next_base); 278 } 279 } else { 280 /* 281 * The next task is using a real segment. Loading the selector 282 * is sufficient. 283 */ 284 loadseg(which, next_index); 285 } 286 } 287 288 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, 289 struct thread_struct *next) 290 { 291 load_seg_legacy(prev->fsindex, prev->fsbase, 292 next->fsindex, next->fsbase, FS); 293 load_seg_legacy(prev->gsindex, prev->gsbase, 294 next->gsindex, next->gsbase, GS); 295 } 296 297 static unsigned long x86_fsgsbase_read_task(struct task_struct *task, 298 unsigned short selector) 299 { 300 unsigned short idx = selector >> 3; 301 unsigned long base; 302 303 if (likely((selector & SEGMENT_TI_MASK) == 0)) { 304 if (unlikely(idx >= GDT_ENTRIES)) 305 return 0; 306 307 /* 308 * There are no user segments in the GDT with nonzero bases 309 * other than the TLS segments. 310 */ 311 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 312 return 0; 313 314 idx -= GDT_ENTRY_TLS_MIN; 315 base = get_desc_base(&task->thread.tls_array[idx]); 316 } else { 317 #ifdef CONFIG_MODIFY_LDT_SYSCALL 318 struct ldt_struct *ldt; 319 320 /* 321 * If performance here mattered, we could protect the LDT 322 * with RCU. This is a slow path, though, so we can just 323 * take the mutex. 324 */ 325 mutex_lock(&task->mm->context.lock); 326 ldt = task->mm->context.ldt; 327 if (unlikely(idx >= ldt->nr_entries)) 328 base = 0; 329 else 330 base = get_desc_base(ldt->entries + idx); 331 mutex_unlock(&task->mm->context.lock); 332 #else 333 base = 0; 334 #endif 335 } 336 337 return base; 338 } 339 340 void x86_fsbase_write_cpu(unsigned long fsbase) 341 { 342 /* 343 * Set the selector to 0 as a notion, that the segment base is 344 * overwritten, which will be checked for skipping the segment load 345 * during context switch. 346 */ 347 loadseg(FS, 0); 348 wrmsrl(MSR_FS_BASE, fsbase); 349 } 350 351 void x86_gsbase_write_cpu_inactive(unsigned long gsbase) 352 { 353 /* Set the selector to 0 for the same reason as %fs above. */ 354 loadseg(GS, 0); 355 wrmsrl(MSR_KERNEL_GS_BASE, gsbase); 356 } 357 358 unsigned long x86_fsbase_read_task(struct task_struct *task) 359 { 360 unsigned long fsbase; 361 362 if (task == current) 363 fsbase = x86_fsbase_read_cpu(); 364 else if (task->thread.fsindex == 0) 365 fsbase = task->thread.fsbase; 366 else 367 fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); 368 369 return fsbase; 370 } 371 372 unsigned long x86_gsbase_read_task(struct task_struct *task) 373 { 374 unsigned long gsbase; 375 376 if (task == current) 377 gsbase = x86_gsbase_read_cpu_inactive(); 378 else if (task->thread.gsindex == 0) 379 gsbase = task->thread.gsbase; 380 else 381 gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); 382 383 return gsbase; 384 } 385 386 int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) 387 { 388 /* 389 * Not strictly needed for %fs, but do it for symmetry 390 * with %gs 391 */ 392 if (unlikely(fsbase >= TASK_SIZE_MAX)) 393 return -EPERM; 394 395 preempt_disable(); 396 task->thread.fsbase = fsbase; 397 if (task == current) 398 x86_fsbase_write_cpu(fsbase); 399 task->thread.fsindex = 0; 400 preempt_enable(); 401 402 return 0; 403 } 404 405 int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) 406 { 407 if (unlikely(gsbase >= TASK_SIZE_MAX)) 408 return -EPERM; 409 410 preempt_disable(); 411 task->thread.gsbase = gsbase; 412 if (task == current) 413 x86_gsbase_write_cpu_inactive(gsbase); 414 task->thread.gsindex = 0; 415 preempt_enable(); 416 417 return 0; 418 } 419 420 int copy_thread_tls(unsigned long clone_flags, unsigned long sp, 421 unsigned long arg, struct task_struct *p, unsigned long tls) 422 { 423 int err; 424 struct pt_regs *childregs; 425 struct fork_frame *fork_frame; 426 struct inactive_task_frame *frame; 427 struct task_struct *me = current; 428 429 childregs = task_pt_regs(p); 430 fork_frame = container_of(childregs, struct fork_frame, regs); 431 frame = &fork_frame->frame; 432 frame->bp = 0; 433 frame->ret_addr = (unsigned long) ret_from_fork; 434 p->thread.sp = (unsigned long) fork_frame; 435 p->thread.io_bitmap_ptr = NULL; 436 437 savesegment(gs, p->thread.gsindex); 438 p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; 439 savesegment(fs, p->thread.fsindex); 440 p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; 441 savesegment(es, p->thread.es); 442 savesegment(ds, p->thread.ds); 443 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 444 445 if (unlikely(p->flags & PF_KTHREAD)) { 446 /* kernel thread */ 447 memset(childregs, 0, sizeof(struct pt_regs)); 448 frame->bx = sp; /* function */ 449 frame->r12 = arg; 450 return 0; 451 } 452 frame->bx = 0; 453 *childregs = *current_pt_regs(); 454 455 childregs->ax = 0; 456 if (sp) 457 childregs->sp = sp; 458 459 err = -ENOMEM; 460 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 461 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, 462 IO_BITMAP_BYTES, GFP_KERNEL); 463 if (!p->thread.io_bitmap_ptr) { 464 p->thread.io_bitmap_max = 0; 465 return -ENOMEM; 466 } 467 set_tsk_thread_flag(p, TIF_IO_BITMAP); 468 } 469 470 /* 471 * Set a new TLS for the child thread? 472 */ 473 if (clone_flags & CLONE_SETTLS) { 474 #ifdef CONFIG_IA32_EMULATION 475 if (in_ia32_syscall()) 476 err = do_set_thread_area(p, -1, 477 (struct user_desc __user *)tls, 0); 478 else 479 #endif 480 err = do_arch_prctl_64(p, ARCH_SET_FS, tls); 481 if (err) 482 goto out; 483 } 484 err = 0; 485 out: 486 if (err && p->thread.io_bitmap_ptr) { 487 kfree(p->thread.io_bitmap_ptr); 488 p->thread.io_bitmap_max = 0; 489 } 490 491 return err; 492 } 493 494 static void 495 start_thread_common(struct pt_regs *regs, unsigned long new_ip, 496 unsigned long new_sp, 497 unsigned int _cs, unsigned int _ss, unsigned int _ds) 498 { 499 WARN_ON_ONCE(regs != current_pt_regs()); 500 501 if (static_cpu_has(X86_BUG_NULL_SEG)) { 502 /* Loading zero below won't clear the base. */ 503 loadsegment(fs, __USER_DS); 504 load_gs_index(__USER_DS); 505 } 506 507 loadsegment(fs, 0); 508 loadsegment(es, _ds); 509 loadsegment(ds, _ds); 510 load_gs_index(0); 511 512 regs->ip = new_ip; 513 regs->sp = new_sp; 514 regs->cs = _cs; 515 regs->ss = _ss; 516 regs->flags = X86_EFLAGS_IF; 517 force_iret(); 518 } 519 520 void 521 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 522 { 523 start_thread_common(regs, new_ip, new_sp, 524 __USER_CS, __USER_DS, 0); 525 } 526 EXPORT_SYMBOL_GPL(start_thread); 527 528 #ifdef CONFIG_COMPAT 529 void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) 530 { 531 start_thread_common(regs, new_ip, new_sp, 532 test_thread_flag(TIF_X32) 533 ? __USER_CS : __USER32_CS, 534 __USER_DS, __USER_DS); 535 } 536 #endif 537 538 /* 539 * switch_to(x,y) should switch tasks from x to y. 540 * 541 * This could still be optimized: 542 * - fold all the options into a flag word and test it with a single test. 543 * - could test fs/gs bitsliced 544 * 545 * Kprobes not supported here. Set the probe on schedule instead. 546 * Function graph tracer not supported too. 547 */ 548 __visible __notrace_funcgraph struct task_struct * 549 __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 550 { 551 struct thread_struct *prev = &prev_p->thread; 552 struct thread_struct *next = &next_p->thread; 553 struct fpu *prev_fpu = &prev->fpu; 554 struct fpu *next_fpu = &next->fpu; 555 int cpu = smp_processor_id(); 556 struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); 557 558 WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && 559 this_cpu_read(irq_count) != -1); 560 561 switch_fpu_prepare(prev_fpu, cpu); 562 563 /* We must save %fs and %gs before load_TLS() because 564 * %fs and %gs may be cleared by load_TLS(). 565 * 566 * (e.g. xen_load_tls()) 567 */ 568 save_fsgs(prev_p); 569 570 /* 571 * Load TLS before restoring any segments so that segment loads 572 * reference the correct GDT entries. 573 */ 574 load_TLS(next, cpu); 575 576 /* 577 * Leave lazy mode, flushing any hypercalls made here. This 578 * must be done after loading TLS entries in the GDT but before 579 * loading segments that might reference them, and and it must 580 * be done before fpu__restore(), so the TS bit is up to 581 * date. 582 */ 583 arch_end_context_switch(next_p); 584 585 /* Switch DS and ES. 586 * 587 * Reading them only returns the selectors, but writing them (if 588 * nonzero) loads the full descriptor from the GDT or LDT. The 589 * LDT for next is loaded in switch_mm, and the GDT is loaded 590 * above. 591 * 592 * We therefore need to write new values to the segment 593 * registers on every context switch unless both the new and old 594 * values are zero. 595 * 596 * Note that we don't need to do anything for CS and SS, as 597 * those are saved and restored as part of pt_regs. 598 */ 599 savesegment(es, prev->es); 600 if (unlikely(next->es | prev->es)) 601 loadsegment(es, next->es); 602 603 savesegment(ds, prev->ds); 604 if (unlikely(next->ds | prev->ds)) 605 loadsegment(ds, next->ds); 606 607 x86_fsgsbase_load(prev, next); 608 609 switch_fpu_finish(next_fpu, cpu); 610 611 /* 612 * Switch the PDA and FPU contexts. 613 */ 614 this_cpu_write(current_task, next_p); 615 this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); 616 617 /* Reload sp0. */ 618 update_task_stack(next_p); 619 620 /* 621 * Now maybe reload the debug registers and handle I/O bitmaps 622 */ 623 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || 624 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 625 __switch_to_xtra(prev_p, next_p, tss); 626 627 #ifdef CONFIG_XEN_PV 628 /* 629 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and 630 * current_pt_regs()->flags may not match the current task's 631 * intended IOPL. We need to switch it manually. 632 */ 633 if (unlikely(static_cpu_has(X86_FEATURE_XENPV) && 634 prev->iopl != next->iopl)) 635 xen_set_iopl_mask(next->iopl); 636 #endif 637 638 if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { 639 /* 640 * AMD CPUs have a misfeature: SYSRET sets the SS selector but 641 * does not update the cached descriptor. As a result, if we 642 * do SYSRET while SS is NULL, we'll end up in user mode with 643 * SS apparently equal to __USER_DS but actually unusable. 644 * 645 * The straightforward workaround would be to fix it up just 646 * before SYSRET, but that would slow down the system call 647 * fast paths. Instead, we ensure that SS is never NULL in 648 * system call context. We do this by replacing NULL SS 649 * selectors at every context switch. SYSCALL sets up a valid 650 * SS, so the only way to get NULL is to re-enter the kernel 651 * from CPL 3 through an interrupt. Since that can't happen 652 * in the same task as a running syscall, we are guaranteed to 653 * context switch between every interrupt vector entry and a 654 * subsequent SYSRET. 655 * 656 * We read SS first because SS reads are much faster than 657 * writes. Out of caution, we force SS to __KERNEL_DS even if 658 * it previously had a different non-NULL value. 659 */ 660 unsigned short ss_sel; 661 savesegment(ss, ss_sel); 662 if (ss_sel != __KERNEL_DS) 663 loadsegment(ss, __KERNEL_DS); 664 } 665 666 /* Load the Intel cache allocation PQR MSR. */ 667 intel_rdt_sched_in(); 668 669 return prev_p; 670 } 671 672 void set_personality_64bit(void) 673 { 674 /* inherit personality from parent */ 675 676 /* Make sure to be in 64bit mode */ 677 clear_thread_flag(TIF_IA32); 678 clear_thread_flag(TIF_ADDR32); 679 clear_thread_flag(TIF_X32); 680 /* Pretend that this comes from a 64bit execve */ 681 task_pt_regs(current)->orig_ax = __NR_execve; 682 current_thread_info()->status &= ~TS_COMPAT; 683 684 /* Ensure the corresponding mm is not marked. */ 685 if (current->mm) 686 current->mm->context.ia32_compat = 0; 687 688 /* TBD: overwrites user setup. Should have two bits. 689 But 64bit processes have always behaved this way, 690 so it's not too bad. The main problem is just that 691 32bit childs are affected again. */ 692 current->personality &= ~READ_IMPLIES_EXEC; 693 } 694 695 static void __set_personality_x32(void) 696 { 697 #ifdef CONFIG_X86_X32 698 clear_thread_flag(TIF_IA32); 699 set_thread_flag(TIF_X32); 700 if (current->mm) 701 current->mm->context.ia32_compat = TIF_X32; 702 current->personality &= ~READ_IMPLIES_EXEC; 703 /* 704 * in_32bit_syscall() uses the presence of the x32 syscall bit 705 * flag to determine compat status. The x86 mmap() code relies on 706 * the syscall bitness so set x32 syscall bit right here to make 707 * in_32bit_syscall() work during exec(). 708 * 709 * Pretend to come from a x32 execve. 710 */ 711 task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; 712 current_thread_info()->status &= ~TS_COMPAT; 713 #endif 714 } 715 716 static void __set_personality_ia32(void) 717 { 718 #ifdef CONFIG_IA32_EMULATION 719 set_thread_flag(TIF_IA32); 720 clear_thread_flag(TIF_X32); 721 if (current->mm) 722 current->mm->context.ia32_compat = TIF_IA32; 723 current->personality |= force_personality32; 724 /* Prepare the first "return" to user space */ 725 task_pt_regs(current)->orig_ax = __NR_ia32_execve; 726 current_thread_info()->status |= TS_COMPAT; 727 #endif 728 } 729 730 void set_personality_ia32(bool x32) 731 { 732 /* Make sure to be in 32bit mode */ 733 set_thread_flag(TIF_ADDR32); 734 735 if (x32) 736 __set_personality_x32(); 737 else 738 __set_personality_ia32(); 739 } 740 EXPORT_SYMBOL_GPL(set_personality_ia32); 741 742 #ifdef CONFIG_CHECKPOINT_RESTORE 743 static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) 744 { 745 int ret; 746 747 ret = map_vdso_once(image, addr); 748 if (ret) 749 return ret; 750 751 return (long)image->size; 752 } 753 #endif 754 755 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) 756 { 757 int ret = 0; 758 759 switch (option) { 760 case ARCH_SET_GS: { 761 ret = x86_gsbase_write_task(task, arg2); 762 break; 763 } 764 case ARCH_SET_FS: { 765 ret = x86_fsbase_write_task(task, arg2); 766 break; 767 } 768 case ARCH_GET_FS: { 769 unsigned long base = x86_fsbase_read_task(task); 770 771 ret = put_user(base, (unsigned long __user *)arg2); 772 break; 773 } 774 case ARCH_GET_GS: { 775 unsigned long base = x86_gsbase_read_task(task); 776 777 ret = put_user(base, (unsigned long __user *)arg2); 778 break; 779 } 780 781 #ifdef CONFIG_CHECKPOINT_RESTORE 782 # ifdef CONFIG_X86_X32_ABI 783 case ARCH_MAP_VDSO_X32: 784 return prctl_map_vdso(&vdso_image_x32, arg2); 785 # endif 786 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 787 case ARCH_MAP_VDSO_32: 788 return prctl_map_vdso(&vdso_image_32, arg2); 789 # endif 790 case ARCH_MAP_VDSO_64: 791 return prctl_map_vdso(&vdso_image_64, arg2); 792 #endif 793 794 default: 795 ret = -EINVAL; 796 break; 797 } 798 799 return ret; 800 } 801 802 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) 803 { 804 long ret; 805 806 ret = do_arch_prctl_64(current, option, arg2); 807 if (ret == -EINVAL) 808 ret = do_arch_prctl_common(current, option, arg2); 809 810 return ret; 811 } 812 813 #ifdef CONFIG_IA32_EMULATION 814 COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) 815 { 816 return do_arch_prctl_common(current, option, arg2); 817 } 818 #endif 819 820 unsigned long KSTK_ESP(struct task_struct *task) 821 { 822 return task_pt_regs(task)->sp; 823 } 824