1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 1995 Linus Torvalds 4 * 5 * Pentium III FXSR, SSE support 6 * Gareth Hughes <gareth@valinux.com>, May 2000 7 * 8 * X86-64 port 9 * Andi Kleen. 10 * 11 * CPU hotplug support - ashok.raj@intel.com 12 */ 13 14 /* 15 * This file handles the architecture-dependent parts of process handling.. 16 */ 17 18 #include <linux/cpu.h> 19 #include <linux/errno.h> 20 #include <linux/sched.h> 21 #include <linux/sched/task.h> 22 #include <linux/sched/task_stack.h> 23 #include <linux/fs.h> 24 #include <linux/kernel.h> 25 #include <linux/mm.h> 26 #include <linux/elfcore.h> 27 #include <linux/smp.h> 28 #include <linux/slab.h> 29 #include <linux/user.h> 30 #include <linux/interrupt.h> 31 #include <linux/delay.h> 32 #include <linux/export.h> 33 #include <linux/ptrace.h> 34 #include <linux/notifier.h> 35 #include <linux/kprobes.h> 36 #include <linux/kdebug.h> 37 #include <linux/prctl.h> 38 #include <linux/uaccess.h> 39 #include <linux/io.h> 40 #include <linux/ftrace.h> 41 #include <linux/syscalls.h> 42 #include <linux/iommu.h> 43 44 #include <asm/processor.h> 45 #include <asm/pkru.h> 46 #include <asm/fpu/sched.h> 47 #include <asm/mmu_context.h> 48 #include <asm/prctl.h> 49 #include <asm/desc.h> 50 #include <asm/proto.h> 51 #include <asm/ia32.h> 52 #include <asm/debugreg.h> 53 #include <asm/switch_to.h> 54 #include <asm/xen/hypervisor.h> 55 #include <asm/vdso.h> 56 #include <asm/resctrl.h> 57 #include <asm/unistd.h> 58 #include <asm/fsgsbase.h> 59 #ifdef CONFIG_IA32_EMULATION 60 /* Not included via unistd.h */ 61 #include <asm/unistd_32_ia32.h> 62 #endif 63 64 #include "process.h" 65 66 /* Prints also some state that isn't saved in the pt_regs */ 67 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, 68 const char *log_lvl) 69 { 70 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 71 unsigned long d0, d1, d2, d3, d6, d7; 72 unsigned int fsindex, gsindex; 73 unsigned int ds, es; 74 75 show_iret_regs(regs, log_lvl); 76 77 if (regs->orig_ax != -1) 78 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); 79 else 80 pr_cont("\n"); 81 82 printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", 83 log_lvl, regs->ax, regs->bx, regs->cx); 84 printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", 85 log_lvl, regs->dx, regs->si, regs->di); 86 printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", 87 log_lvl, regs->bp, regs->r8, regs->r9); 88 printk("%sR10: %016lx R11: %016lx R12: %016lx\n", 89 log_lvl, regs->r10, regs->r11, regs->r12); 90 printk("%sR13: %016lx R14: %016lx R15: %016lx\n", 91 log_lvl, regs->r13, regs->r14, regs->r15); 92 93 if (mode == SHOW_REGS_SHORT) 94 return; 95 96 if (mode == SHOW_REGS_USER) { 97 rdmsrl(MSR_FS_BASE, fs); 98 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 99 printk("%sFS: %016lx GS: %016lx\n", 100 log_lvl, fs, shadowgs); 101 return; 102 } 103 104 asm("movl %%ds,%0" : "=r" (ds)); 105 asm("movl %%es,%0" : "=r" (es)); 106 asm("movl %%fs,%0" : "=r" (fsindex)); 107 asm("movl %%gs,%0" : "=r" (gsindex)); 108 109 rdmsrl(MSR_FS_BASE, fs); 110 rdmsrl(MSR_GS_BASE, gs); 111 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 112 113 cr0 = read_cr0(); 114 cr2 = read_cr2(); 115 cr3 = __read_cr3(); 116 cr4 = __read_cr4(); 117 118 printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 119 log_lvl, fs, fsindex, gs, gsindex, shadowgs); 120 printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", 121 log_lvl, regs->cs, ds, es, cr0); 122 printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", 123 log_lvl, cr2, cr3, cr4); 124 125 get_debugreg(d0, 0); 126 get_debugreg(d1, 1); 127 get_debugreg(d2, 2); 128 get_debugreg(d3, 3); 129 get_debugreg(d6, 6); 130 get_debugreg(d7, 7); 131 132 /* Only print out debug registers if they are in their non-default state. */ 133 if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && 134 (d6 == DR6_RESERVED) && (d7 == 0x400))) { 135 printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", 136 log_lvl, d0, d1, d2); 137 printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", 138 log_lvl, d3, d6, d7); 139 } 140 141 if (cr4 & X86_CR4_PKE) 142 printk("%sPKRU: %08x\n", log_lvl, read_pkru()); 143 } 144 145 void release_thread(struct task_struct *dead_task) 146 { 147 WARN_ON(dead_task->mm); 148 } 149 150 enum which_selector { 151 FS, 152 GS 153 }; 154 155 /* 156 * Out of line to be protected from kprobes and tracing. If this would be 157 * traced or probed than any access to a per CPU variable happens with 158 * the wrong GS. 159 * 160 * It is not used on Xen paravirt. When paravirt support is needed, it 161 * needs to be renamed with native_ prefix. 162 */ 163 static noinstr unsigned long __rdgsbase_inactive(void) 164 { 165 unsigned long gsbase; 166 167 lockdep_assert_irqs_disabled(); 168 169 if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { 170 native_swapgs(); 171 gsbase = rdgsbase(); 172 native_swapgs(); 173 } else { 174 instrumentation_begin(); 175 rdmsrl(MSR_KERNEL_GS_BASE, gsbase); 176 instrumentation_end(); 177 } 178 179 return gsbase; 180 } 181 182 /* 183 * Out of line to be protected from kprobes and tracing. If this would be 184 * traced or probed than any access to a per CPU variable happens with 185 * the wrong GS. 186 * 187 * It is not used on Xen paravirt. When paravirt support is needed, it 188 * needs to be renamed with native_ prefix. 189 */ 190 static noinstr void __wrgsbase_inactive(unsigned long gsbase) 191 { 192 lockdep_assert_irqs_disabled(); 193 194 if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { 195 native_swapgs(); 196 wrgsbase(gsbase); 197 native_swapgs(); 198 } else { 199 instrumentation_begin(); 200 wrmsrl(MSR_KERNEL_GS_BASE, gsbase); 201 instrumentation_end(); 202 } 203 } 204 205 /* 206 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are 207 * not available. The goal is to be reasonably fast on non-FSGSBASE systems. 208 * It's forcibly inlined because it'll generate better code and this function 209 * is hot. 210 */ 211 static __always_inline void save_base_legacy(struct task_struct *prev_p, 212 unsigned short selector, 213 enum which_selector which) 214 { 215 if (likely(selector == 0)) { 216 /* 217 * On Intel (without X86_BUG_NULL_SEG), the segment base could 218 * be the pre-existing saved base or it could be zero. On AMD 219 * (with X86_BUG_NULL_SEG), the segment base could be almost 220 * anything. 221 * 222 * This branch is very hot (it's hit twice on almost every 223 * context switch between 64-bit programs), and avoiding 224 * the RDMSR helps a lot, so we just assume that whatever 225 * value is already saved is correct. This matches historical 226 * Linux behavior, so it won't break existing applications. 227 * 228 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we 229 * report that the base is zero, it needs to actually be zero: 230 * see the corresponding logic in load_seg_legacy. 231 */ 232 } else { 233 /* 234 * If the selector is 1, 2, or 3, then the base is zero on 235 * !X86_BUG_NULL_SEG CPUs and could be anything on 236 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux 237 * has never attempted to preserve the base across context 238 * switches. 239 * 240 * If selector > 3, then it refers to a real segment, and 241 * saving the base isn't necessary. 242 */ 243 if (which == FS) 244 prev_p->thread.fsbase = 0; 245 else 246 prev_p->thread.gsbase = 0; 247 } 248 } 249 250 static __always_inline void save_fsgs(struct task_struct *task) 251 { 252 savesegment(fs, task->thread.fsindex); 253 savesegment(gs, task->thread.gsindex); 254 if (static_cpu_has(X86_FEATURE_FSGSBASE)) { 255 /* 256 * If FSGSBASE is enabled, we can't make any useful guesses 257 * about the base, and user code expects us to save the current 258 * value. Fortunately, reading the base directly is efficient. 259 */ 260 task->thread.fsbase = rdfsbase(); 261 task->thread.gsbase = __rdgsbase_inactive(); 262 } else { 263 save_base_legacy(task, task->thread.fsindex, FS); 264 save_base_legacy(task, task->thread.gsindex, GS); 265 } 266 } 267 268 /* 269 * While a process is running,current->thread.fsbase and current->thread.gsbase 270 * may not match the corresponding CPU registers (see save_base_legacy()). 271 */ 272 void current_save_fsgs(void) 273 { 274 unsigned long flags; 275 276 /* Interrupts need to be off for FSGSBASE */ 277 local_irq_save(flags); 278 save_fsgs(current); 279 local_irq_restore(flags); 280 } 281 #if IS_ENABLED(CONFIG_KVM) 282 EXPORT_SYMBOL_GPL(current_save_fsgs); 283 #endif 284 285 static __always_inline void loadseg(enum which_selector which, 286 unsigned short sel) 287 { 288 if (which == FS) 289 loadsegment(fs, sel); 290 else 291 load_gs_index(sel); 292 } 293 294 static __always_inline void load_seg_legacy(unsigned short prev_index, 295 unsigned long prev_base, 296 unsigned short next_index, 297 unsigned long next_base, 298 enum which_selector which) 299 { 300 if (likely(next_index <= 3)) { 301 /* 302 * The next task is using 64-bit TLS, is not using this 303 * segment at all, or is having fun with arcane CPU features. 304 */ 305 if (next_base == 0) { 306 /* 307 * Nasty case: on AMD CPUs, we need to forcibly zero 308 * the base. 309 */ 310 if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { 311 loadseg(which, __USER_DS); 312 loadseg(which, next_index); 313 } else { 314 /* 315 * We could try to exhaustively detect cases 316 * under which we can skip the segment load, 317 * but there's really only one case that matters 318 * for performance: if both the previous and 319 * next states are fully zeroed, we can skip 320 * the load. 321 * 322 * (This assumes that prev_base == 0 has no 323 * false positives. This is the case on 324 * Intel-style CPUs.) 325 */ 326 if (likely(prev_index | next_index | prev_base)) 327 loadseg(which, next_index); 328 } 329 } else { 330 if (prev_index != next_index) 331 loadseg(which, next_index); 332 wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, 333 next_base); 334 } 335 } else { 336 /* 337 * The next task is using a real segment. Loading the selector 338 * is sufficient. 339 */ 340 loadseg(which, next_index); 341 } 342 } 343 344 /* 345 * Store prev's PKRU value and load next's PKRU value if they differ. PKRU 346 * is not XSTATE managed on context switch because that would require a 347 * lookup in the task's FPU xsave buffer and require to keep that updated 348 * in various places. 349 */ 350 static __always_inline void x86_pkru_load(struct thread_struct *prev, 351 struct thread_struct *next) 352 { 353 if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) 354 return; 355 356 /* Stash the prev task's value: */ 357 prev->pkru = rdpkru(); 358 359 /* 360 * PKRU writes are slightly expensive. Avoid them when not 361 * strictly necessary: 362 */ 363 if (prev->pkru != next->pkru) 364 wrpkru(next->pkru); 365 } 366 367 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, 368 struct thread_struct *next) 369 { 370 if (static_cpu_has(X86_FEATURE_FSGSBASE)) { 371 /* Update the FS and GS selectors if they could have changed. */ 372 if (unlikely(prev->fsindex || next->fsindex)) 373 loadseg(FS, next->fsindex); 374 if (unlikely(prev->gsindex || next->gsindex)) 375 loadseg(GS, next->gsindex); 376 377 /* Update the bases. */ 378 wrfsbase(next->fsbase); 379 __wrgsbase_inactive(next->gsbase); 380 } else { 381 load_seg_legacy(prev->fsindex, prev->fsbase, 382 next->fsindex, next->fsbase, FS); 383 load_seg_legacy(prev->gsindex, prev->gsbase, 384 next->gsindex, next->gsbase, GS); 385 } 386 } 387 388 unsigned long x86_fsgsbase_read_task(struct task_struct *task, 389 unsigned short selector) 390 { 391 unsigned short idx = selector >> 3; 392 unsigned long base; 393 394 if (likely((selector & SEGMENT_TI_MASK) == 0)) { 395 if (unlikely(idx >= GDT_ENTRIES)) 396 return 0; 397 398 /* 399 * There are no user segments in the GDT with nonzero bases 400 * other than the TLS segments. 401 */ 402 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) 403 return 0; 404 405 idx -= GDT_ENTRY_TLS_MIN; 406 base = get_desc_base(&task->thread.tls_array[idx]); 407 } else { 408 #ifdef CONFIG_MODIFY_LDT_SYSCALL 409 struct ldt_struct *ldt; 410 411 /* 412 * If performance here mattered, we could protect the LDT 413 * with RCU. This is a slow path, though, so we can just 414 * take the mutex. 415 */ 416 mutex_lock(&task->mm->context.lock); 417 ldt = task->mm->context.ldt; 418 if (unlikely(!ldt || idx >= ldt->nr_entries)) 419 base = 0; 420 else 421 base = get_desc_base(ldt->entries + idx); 422 mutex_unlock(&task->mm->context.lock); 423 #else 424 base = 0; 425 #endif 426 } 427 428 return base; 429 } 430 431 unsigned long x86_gsbase_read_cpu_inactive(void) 432 { 433 unsigned long gsbase; 434 435 if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { 436 unsigned long flags; 437 438 local_irq_save(flags); 439 gsbase = __rdgsbase_inactive(); 440 local_irq_restore(flags); 441 } else { 442 rdmsrl(MSR_KERNEL_GS_BASE, gsbase); 443 } 444 445 return gsbase; 446 } 447 448 void x86_gsbase_write_cpu_inactive(unsigned long gsbase) 449 { 450 if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { 451 unsigned long flags; 452 453 local_irq_save(flags); 454 __wrgsbase_inactive(gsbase); 455 local_irq_restore(flags); 456 } else { 457 wrmsrl(MSR_KERNEL_GS_BASE, gsbase); 458 } 459 } 460 461 unsigned long x86_fsbase_read_task(struct task_struct *task) 462 { 463 unsigned long fsbase; 464 465 if (task == current) 466 fsbase = x86_fsbase_read_cpu(); 467 else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || 468 (task->thread.fsindex == 0)) 469 fsbase = task->thread.fsbase; 470 else 471 fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); 472 473 return fsbase; 474 } 475 476 unsigned long x86_gsbase_read_task(struct task_struct *task) 477 { 478 unsigned long gsbase; 479 480 if (task == current) 481 gsbase = x86_gsbase_read_cpu_inactive(); 482 else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || 483 (task->thread.gsindex == 0)) 484 gsbase = task->thread.gsbase; 485 else 486 gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); 487 488 return gsbase; 489 } 490 491 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) 492 { 493 WARN_ON_ONCE(task == current); 494 495 task->thread.fsbase = fsbase; 496 } 497 498 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) 499 { 500 WARN_ON_ONCE(task == current); 501 502 task->thread.gsbase = gsbase; 503 } 504 505 static void 506 start_thread_common(struct pt_regs *regs, unsigned long new_ip, 507 unsigned long new_sp, 508 unsigned int _cs, unsigned int _ss, unsigned int _ds) 509 { 510 WARN_ON_ONCE(regs != current_pt_regs()); 511 512 if (static_cpu_has(X86_BUG_NULL_SEG)) { 513 /* Loading zero below won't clear the base. */ 514 loadsegment(fs, __USER_DS); 515 load_gs_index(__USER_DS); 516 } 517 518 reset_thread_features(); 519 520 loadsegment(fs, 0); 521 loadsegment(es, _ds); 522 loadsegment(ds, _ds); 523 load_gs_index(0); 524 525 regs->ip = new_ip; 526 regs->sp = new_sp; 527 regs->cs = _cs; 528 regs->ss = _ss; 529 regs->flags = X86_EFLAGS_IF; 530 } 531 532 void 533 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 534 { 535 start_thread_common(regs, new_ip, new_sp, 536 __USER_CS, __USER_DS, 0); 537 } 538 EXPORT_SYMBOL_GPL(start_thread); 539 540 #ifdef CONFIG_COMPAT 541 void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) 542 { 543 start_thread_common(regs, new_ip, new_sp, 544 x32 ? __USER_CS : __USER32_CS, 545 __USER_DS, __USER_DS); 546 } 547 #endif 548 549 /* 550 * switch_to(x,y) should switch tasks from x to y. 551 * 552 * This could still be optimized: 553 * - fold all the options into a flag word and test it with a single test. 554 * - could test fs/gs bitsliced 555 * 556 * Kprobes not supported here. Set the probe on schedule instead. 557 * Function graph tracer not supported too. 558 */ 559 __no_kmsan_checks 560 __visible __notrace_funcgraph struct task_struct * 561 __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 562 { 563 struct thread_struct *prev = &prev_p->thread; 564 struct thread_struct *next = &next_p->thread; 565 struct fpu *prev_fpu = &prev->fpu; 566 int cpu = smp_processor_id(); 567 568 WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && 569 this_cpu_read(pcpu_hot.hardirq_stack_inuse)); 570 571 if (!test_thread_flag(TIF_NEED_FPU_LOAD)) 572 switch_fpu_prepare(prev_fpu, cpu); 573 574 /* We must save %fs and %gs before load_TLS() because 575 * %fs and %gs may be cleared by load_TLS(). 576 * 577 * (e.g. xen_load_tls()) 578 */ 579 save_fsgs(prev_p); 580 581 /* 582 * Load TLS before restoring any segments so that segment loads 583 * reference the correct GDT entries. 584 */ 585 load_TLS(next, cpu); 586 587 /* 588 * Leave lazy mode, flushing any hypercalls made here. This 589 * must be done after loading TLS entries in the GDT but before 590 * loading segments that might reference them. 591 */ 592 arch_end_context_switch(next_p); 593 594 /* Switch DS and ES. 595 * 596 * Reading them only returns the selectors, but writing them (if 597 * nonzero) loads the full descriptor from the GDT or LDT. The 598 * LDT for next is loaded in switch_mm, and the GDT is loaded 599 * above. 600 * 601 * We therefore need to write new values to the segment 602 * registers on every context switch unless both the new and old 603 * values are zero. 604 * 605 * Note that we don't need to do anything for CS and SS, as 606 * those are saved and restored as part of pt_regs. 607 */ 608 savesegment(es, prev->es); 609 if (unlikely(next->es | prev->es)) 610 loadsegment(es, next->es); 611 612 savesegment(ds, prev->ds); 613 if (unlikely(next->ds | prev->ds)) 614 loadsegment(ds, next->ds); 615 616 x86_fsgsbase_load(prev, next); 617 618 x86_pkru_load(prev, next); 619 620 /* 621 * Switch the PDA and FPU contexts. 622 */ 623 raw_cpu_write(pcpu_hot.current_task, next_p); 624 raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); 625 626 switch_fpu_finish(); 627 628 /* Reload sp0. */ 629 update_task_stack(next_p); 630 631 switch_to_extra(prev_p, next_p); 632 633 if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { 634 /* 635 * AMD CPUs have a misfeature: SYSRET sets the SS selector but 636 * does not update the cached descriptor. As a result, if we 637 * do SYSRET while SS is NULL, we'll end up in user mode with 638 * SS apparently equal to __USER_DS but actually unusable. 639 * 640 * The straightforward workaround would be to fix it up just 641 * before SYSRET, but that would slow down the system call 642 * fast paths. Instead, we ensure that SS is never NULL in 643 * system call context. We do this by replacing NULL SS 644 * selectors at every context switch. SYSCALL sets up a valid 645 * SS, so the only way to get NULL is to re-enter the kernel 646 * from CPL 3 through an interrupt. Since that can't happen 647 * in the same task as a running syscall, we are guaranteed to 648 * context switch between every interrupt vector entry and a 649 * subsequent SYSRET. 650 * 651 * We read SS first because SS reads are much faster than 652 * writes. Out of caution, we force SS to __KERNEL_DS even if 653 * it previously had a different non-NULL value. 654 */ 655 unsigned short ss_sel; 656 savesegment(ss, ss_sel); 657 if (ss_sel != __KERNEL_DS) 658 loadsegment(ss, __KERNEL_DS); 659 } 660 661 /* Load the Intel cache allocation PQR MSR. */ 662 resctrl_sched_in(next_p); 663 664 return prev_p; 665 } 666 667 void set_personality_64bit(void) 668 { 669 /* inherit personality from parent */ 670 671 /* Make sure to be in 64bit mode */ 672 clear_thread_flag(TIF_ADDR32); 673 /* Pretend that this comes from a 64bit execve */ 674 task_pt_regs(current)->orig_ax = __NR_execve; 675 current_thread_info()->status &= ~TS_COMPAT; 676 if (current->mm) 677 __set_bit(MM_CONTEXT_HAS_VSYSCALL, ¤t->mm->context.flags); 678 679 /* TBD: overwrites user setup. Should have two bits. 680 But 64bit processes have always behaved this way, 681 so it's not too bad. The main problem is just that 682 32bit children are affected again. */ 683 current->personality &= ~READ_IMPLIES_EXEC; 684 } 685 686 static void __set_personality_x32(void) 687 { 688 #ifdef CONFIG_X86_X32_ABI 689 if (current->mm) 690 current->mm->context.flags = 0; 691 692 current->personality &= ~READ_IMPLIES_EXEC; 693 /* 694 * in_32bit_syscall() uses the presence of the x32 syscall bit 695 * flag to determine compat status. The x86 mmap() code relies on 696 * the syscall bitness so set x32 syscall bit right here to make 697 * in_32bit_syscall() work during exec(). 698 * 699 * Pretend to come from a x32 execve. 700 */ 701 task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; 702 current_thread_info()->status &= ~TS_COMPAT; 703 #endif 704 } 705 706 static void __set_personality_ia32(void) 707 { 708 #ifdef CONFIG_IA32_EMULATION 709 if (current->mm) { 710 /* 711 * uprobes applied to this MM need to know this and 712 * cannot use user_64bit_mode() at that time. 713 */ 714 __set_bit(MM_CONTEXT_UPROBE_IA32, ¤t->mm->context.flags); 715 } 716 717 current->personality |= force_personality32; 718 /* Prepare the first "return" to user space */ 719 task_pt_regs(current)->orig_ax = __NR_ia32_execve; 720 current_thread_info()->status |= TS_COMPAT; 721 #endif 722 } 723 724 void set_personality_ia32(bool x32) 725 { 726 /* Make sure to be in 32bit mode */ 727 set_thread_flag(TIF_ADDR32); 728 729 if (x32) 730 __set_personality_x32(); 731 else 732 __set_personality_ia32(); 733 } 734 EXPORT_SYMBOL_GPL(set_personality_ia32); 735 736 #ifdef CONFIG_CHECKPOINT_RESTORE 737 static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) 738 { 739 int ret; 740 741 ret = map_vdso_once(image, addr); 742 if (ret) 743 return ret; 744 745 return (long)image->size; 746 } 747 #endif 748 749 #ifdef CONFIG_ADDRESS_MASKING 750 751 #define LAM_U57_BITS 6 752 753 static void enable_lam_func(void *__mm) 754 { 755 struct mm_struct *mm = __mm; 756 757 if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { 758 write_cr3(__read_cr3() | mm->context.lam_cr3_mask); 759 set_tlbstate_lam_mode(mm); 760 } 761 } 762 763 static void mm_enable_lam(struct mm_struct *mm) 764 { 765 /* 766 * Even though the process must still be single-threaded at this 767 * point, kernel threads may be using the mm. IPI those kernel 768 * threads if they exist. 769 */ 770 on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true); 771 set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); 772 } 773 774 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) 775 { 776 if (!cpu_feature_enabled(X86_FEATURE_LAM)) 777 return -ENODEV; 778 779 /* PTRACE_ARCH_PRCTL */ 780 if (current->mm != mm) 781 return -EINVAL; 782 783 if (mm_valid_pasid(mm) && 784 !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags)) 785 return -EINVAL; 786 787 if (mmap_write_lock_killable(mm)) 788 return -EINTR; 789 790 /* 791 * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from 792 * being enabled unless the process is single threaded: 793 */ 794 if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { 795 mmap_write_unlock(mm); 796 return -EBUSY; 797 } 798 799 if (!nr_bits) { 800 mmap_write_unlock(mm); 801 return -EINVAL; 802 } else if (nr_bits <= LAM_U57_BITS) { 803 mm->context.lam_cr3_mask = X86_CR3_LAM_U57; 804 mm->context.untag_mask = ~GENMASK(62, 57); 805 } else { 806 mmap_write_unlock(mm); 807 return -EINVAL; 808 } 809 810 mm_enable_lam(mm); 811 812 mmap_write_unlock(mm); 813 814 return 0; 815 } 816 #endif 817 818 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) 819 { 820 int ret = 0; 821 822 switch (option) { 823 case ARCH_SET_GS: { 824 if (unlikely(arg2 >= TASK_SIZE_MAX)) 825 return -EPERM; 826 827 preempt_disable(); 828 /* 829 * ARCH_SET_GS has always overwritten the index 830 * and the base. Zero is the most sensible value 831 * to put in the index, and is the only value that 832 * makes any sense if FSGSBASE is unavailable. 833 */ 834 if (task == current) { 835 loadseg(GS, 0); 836 x86_gsbase_write_cpu_inactive(arg2); 837 838 /* 839 * On non-FSGSBASE systems, save_base_legacy() expects 840 * that we also fill in thread.gsbase. 841 */ 842 task->thread.gsbase = arg2; 843 844 } else { 845 task->thread.gsindex = 0; 846 x86_gsbase_write_task(task, arg2); 847 } 848 preempt_enable(); 849 break; 850 } 851 case ARCH_SET_FS: { 852 /* 853 * Not strictly needed for %fs, but do it for symmetry 854 * with %gs 855 */ 856 if (unlikely(arg2 >= TASK_SIZE_MAX)) 857 return -EPERM; 858 859 preempt_disable(); 860 /* 861 * Set the selector to 0 for the same reason 862 * as %gs above. 863 */ 864 if (task == current) { 865 loadseg(FS, 0); 866 x86_fsbase_write_cpu(arg2); 867 868 /* 869 * On non-FSGSBASE systems, save_base_legacy() expects 870 * that we also fill in thread.fsbase. 871 */ 872 task->thread.fsbase = arg2; 873 } else { 874 task->thread.fsindex = 0; 875 x86_fsbase_write_task(task, arg2); 876 } 877 preempt_enable(); 878 break; 879 } 880 case ARCH_GET_FS: { 881 unsigned long base = x86_fsbase_read_task(task); 882 883 ret = put_user(base, (unsigned long __user *)arg2); 884 break; 885 } 886 case ARCH_GET_GS: { 887 unsigned long base = x86_gsbase_read_task(task); 888 889 ret = put_user(base, (unsigned long __user *)arg2); 890 break; 891 } 892 893 #ifdef CONFIG_CHECKPOINT_RESTORE 894 # ifdef CONFIG_X86_X32_ABI 895 case ARCH_MAP_VDSO_X32: 896 return prctl_map_vdso(&vdso_image_x32, arg2); 897 # endif 898 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 899 case ARCH_MAP_VDSO_32: 900 return prctl_map_vdso(&vdso_image_32, arg2); 901 # endif 902 case ARCH_MAP_VDSO_64: 903 return prctl_map_vdso(&vdso_image_64, arg2); 904 #endif 905 #ifdef CONFIG_ADDRESS_MASKING 906 case ARCH_GET_UNTAG_MASK: 907 return put_user(task->mm->context.untag_mask, 908 (unsigned long __user *)arg2); 909 case ARCH_ENABLE_TAGGED_ADDR: 910 return prctl_enable_tagged_addr(task->mm, arg2); 911 case ARCH_FORCE_TAGGED_SVA: 912 if (current != task) 913 return -EINVAL; 914 set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags); 915 return 0; 916 case ARCH_GET_MAX_TAG_BITS: 917 if (!cpu_feature_enabled(X86_FEATURE_LAM)) 918 return put_user(0, (unsigned long __user *)arg2); 919 else 920 return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); 921 #endif 922 case ARCH_SHSTK_ENABLE: 923 case ARCH_SHSTK_DISABLE: 924 case ARCH_SHSTK_LOCK: 925 case ARCH_SHSTK_UNLOCK: 926 case ARCH_SHSTK_STATUS: 927 return shstk_prctl(task, option, arg2); 928 default: 929 ret = -EINVAL; 930 break; 931 } 932 933 return ret; 934 } 935 936 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) 937 { 938 long ret; 939 940 ret = do_arch_prctl_64(current, option, arg2); 941 if (ret == -EINVAL) 942 ret = do_arch_prctl_common(option, arg2); 943 944 return ret; 945 } 946 947 #ifdef CONFIG_IA32_EMULATION 948 COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) 949 { 950 return do_arch_prctl_common(option, arg2); 951 } 952 #endif 953 954 unsigned long KSTK_ESP(struct task_struct *task) 955 { 956 return task_pt_regs(task)->sp; 957 } 958