// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/desc.h>
#include <asm/prctl.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
#endif
	 },
#ifdef CONFIG_X86_32
	 /*
	  * Note that the .io_bitmap member must be extra-big. This is because
	  * the CPU will access an additional byte beyond the end of the IO
	  * permission bitmap. The extra byte must be all 1 bits, and must
	  * be within the limit.
	  */
	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
	.SYSENTER_stack_canary	= STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif

	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
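
/*
 * Illustrative sketch, not part of the original file: the io_bitmap in the
 * per-CPU TSS above is driven by a task's thread.io_bitmap_ptr, which is
 * normally set up from userspace via ioperm(2). A minimal, hypothetical
 * userspace fragment (x86, needs CAP_SYS_RAWIO, error handling omitted):
 *
 *	#include <sys/io.h>
 *
 *	int main(void)
 *	{
 *		// Ask the kernel for access to one I/O port; the ioperm()
 *		// implementation allocates thread.io_bitmap_ptr and sets
 *		// TIF_IO_BITMAP, so the bitmap is copied into the per-CPU
 *		// TSS on context switch and freed again in exit_thread().
 *		if (ioperm(0x378, 1, 1))
 *			return 1;
 *		return 0;
 *	}
 */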

/*
 * Free current thread data structures etc..
 */
void exit_thread(struct task_struct *tsk)
{
	struct thread_struct *t = &tsk->thread;
	unsigned long *bp = t->io_bitmap_ptr;
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	free_vm86(t);

	fpu__drop(fpu);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear(&tsk->thread.fpu);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_set_bits(X86_CR4_TSD);
	preempt_enable();
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_clear_bits(X86_CR4_TSD);
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

DEFINE_PER_CPU(u64, msr_misc_features_shadow);

static void set_cpuid_faulting(bool on)
{
	u64 msrval;

	msrval = this_cpu_read(msr_misc_features_shadow);
	msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
	msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
	this_cpu_write(msr_misc_features_shadow, msrval);
	wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
}

static void disable_cpuid(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOCPUID)) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOCPUID in the current running context.
		 */
		set_cpuid_faulting(true);
	}
	preempt_enable();
}

static void enable_cpuid(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOCPUID)) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOCPUID in the current running context.
		 */
		set_cpuid_faulting(false);
	}
	preempt_enable();
}

static int get_cpuid_mode(void)
{
	return !test_thread_flag(TIF_NOCPUID);
}

static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
{
	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
		return -ENODEV;

	if (cpuid_enabled)
		enable_cpuid();
	else
		disable_cpuid();

	return 0;
}
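
/*
 * Illustrative sketch, not part of the original file: get_tsc_mode() and
 * set_tsc_mode() above back the PR_GET_TSC/PR_SET_TSC prctl(2) interface.
 * A minimal, hypothetical userspace fragment (error handling omitted):
 *
 *	#include <sys/prctl.h>		// PR_GET_TSC, PR_SET_TSC, PR_TSC_*
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int mode;
 *
 *		// Query the current mode; the kernel writes PR_TSC_ENABLE or
 *		// PR_TSC_SIGSEGV through the pointer (see get_tsc_mode()).
 *		prctl(PR_GET_TSC, &mode);
 *		printf("tsc mode: %d\n", mode);
 *
 *		// Make RDTSC raise SIGSEGV in this task; this sets TIF_NOTSC
 *		// and CR4.TSD (see disable_TSC()).
 *		prctl(PR_SET_TSC, PR_TSC_SIGSEGV);
 *		return 0;
 *	}
 */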

/*
 * Called immediately after a successful exec.
 */
void arch_setup_new_exec(void)
{
	/* If cpuid was previously disabled for this task, re-enable it. */
	if (test_thread_flag(TIF_NOCPUID))
		enable_cpuid();
}

static inline void switch_to_bitmap(struct tss_struct *tss,
				    struct thread_struct *prev,
				    struct thread_struct *next,
				    unsigned long tifp, unsigned long tifn)
{
	if (tifn & _TIF_IO_BITMAP) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
		/*
		 * Make sure that the TSS limit is correct for the CPU
		 * to notice the IO bitmap.
		 */
		refresh_tss_limit();
	} else if (tifp & _TIF_IO_BITMAP) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long tifp, tifn;

	prev = &prev_p->thread;
	next = &next_p->thread;

	tifn = READ_ONCE(task_thread_info(next_p)->flags);
	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
	switch_to_bitmap(tss, prev, next, tifp, tifn);

	propagate_user_return_notify(prev_p, next_p);

	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
	    arch_has_block_step()) {
		unsigned long debugctl, msk;

		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
		debugctl &= ~DEBUGCTLMSR_BTF;
		msk = tifn & _TIF_BLOCKSTEP;
		debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	}

	if ((tifp ^ tifn) & _TIF_NOTSC)
		cr4_toggle_bits(X86_CR4_TSD);

	if ((tifp ^ tifn) & _TIF_NOCPUID)
		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
}
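
/*
 * Illustrative note, not part of the original file: the (tifp ^ tifn) tests
 * in __switch_to_xtra() above detect a flag that differs between the outgoing
 * and incoming task, so the expensive CR4/MSR write is only done when the
 * setting actually changes across the switch. For _TIF_NOTSC:
 *
 *	prev	next	(tifp ^ tifn) & _TIF_NOTSC
 *	 0	 0	0 -> leave CR4.TSD alone
 *	 0	 1	1 -> toggle CR4.TSD
 *	 1	 0	1 -> toggle CR4.TSD
 *	 1	 1	0 -> leave CR4.TSD alone
 */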

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

void arch_cpu_idle_enter(void)
{
	tsc_verify_tsc_adjust(false);
	local_touch_nmi();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void __cpuidle default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	for (;;) {
		/*
		 * Use wbinvd followed by hlt to stop the processor. This
		 * provides support for kexec on a processor that supports
		 * SME. With kexec, going from SME inactive to SME active
		 * requires clearing cache entries so that addresses without
		 * the encryption bit set don't corrupt the same physical
		 * address that has the encryption bit set when caches are
		 * flushed. To achieve this a wbinvd is performed followed by
		 * a hlt. Even if the processor is not in the kexec/SME
		 * scenario this only adds a wbinvd to a halting processor.
		 */
		asm volatile("wbinvd; hlt" : : : "memory");
	}
}

/*
 * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
 * states (local apic timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	/*
	 * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
	 * gets set after static_cpu_has() places have been converted via
	 * alternatives.
	 */
	if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		default_idle();
		return;
	}

	tick_broadcast_enter();

	default_idle();

	/*
	 * The switch back from broadcast mode needs to be called with
	 * interrupts disabled.
	 */
	local_irq_disable();
	tick_broadcast_exit();
	local_irq_enable();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
		return 0;

	return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void amd_e400_c1e_apic_setup(void)
{
	if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
		local_irq_disable();
		tick_broadcast_force();
		local_irq_enable();
	}
}
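
/*
 * Illustrative summary, not part of the original file: the boot-time idle
 * routine selection in select_idle_routine() above, in order of precedence:
 *
 *	1. "idle=poll" on the command line -> generic polling idle,
 *	   x86_idle is left unset
 *	2. X86_BUG_AMD_E400                -> amd_e400_idle()
 *	3. Intel with usable MWAIT         -> mwait_idle()
 *	4. otherwise                       -> default_idle() (HLT)
 */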

void __init arch_post_acpi_subsys_init(void)
{
	u32 lo, hi;

	if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
		return;

	/*
	 * AMD E400 detection needs to happen after ACPI has been enabled. If
	 * the machine is affected, K8_INTP_C1E_ACTIVE_MASK bits are set in
	 * MSR_K8_INT_PENDING_MSG.
	 */
	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
	if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
		return;

	boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);

	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
		mark_tsc_unstable("TSC halt in AMD C1E");
	pr_info("System has AMD C1E enabled\n");
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	return randomize_page(mm->brk, 0x02000000);
}
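
/*
 * Illustrative note, not part of the original file: idle_setup() above is the
 * early_param() handler for the "idle=" kernel command line option, e.g.:
 *
 *	idle=poll	- busy-poll in the idle loop instead of halting
 *	idle=halt	- force default_idle() (HLT) as the idle routine
 *	idle=nomwait	- do not use MWAIT for C-states
 */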

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	if (!try_get_task_stack(p))
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		goto out;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start
	 *
	 * The task's stack pointer points at the location where the
	 * frame pointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start;

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		goto out;

	fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
	do {
		if (fp < bottom || fp > top)
			goto out;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip)) {
			ret = ip;
			goto out;
		}
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);

out:
	put_task_stack(p);
	return ret;
}

long do_arch_prctl_common(struct task_struct *task, int option,
			  unsigned long cpuid_enabled)
{
	switch (option) {
	case ARCH_GET_CPUID:
		return get_cpuid_mode();
	case ARCH_SET_CPUID:
		return set_cpuid_mode(task, cpuid_enabled);
	}

	return -EINVAL;
}
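
/*
 * Illustrative sketch, not part of the original file: do_arch_prctl_common()
 * above backs the ARCH_GET_CPUID/ARCH_SET_CPUID arch_prctl(2) commands. A
 * minimal, hypothetical x86-64 userspace fragment, assuming a CPU with
 * X86_FEATURE_CPUID_FAULT (error handling omitted):
 *
 *	#include <sys/syscall.h>	// SYS_arch_prctl
 *	#include <asm/prctl.h>		// ARCH_GET_CPUID, ARCH_SET_CPUID
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		// 1 if CPUID is currently allowed for this task, 0 if it faults.
 *		long enabled = syscall(SYS_arch_prctl, ARCH_GET_CPUID, 0);
 *		printf("cpuid enabled: %ld\n", enabled);
 *
 *		// Make CPUID raise SIGSEGV in this task (see set_cpuid_mode()
 *		// and set_cpuid_faulting()); a non-zero argument re-enables it.
 *		syscall(SYS_arch_prctl, ARCH_SET_CPUID, 0);
 *		return 0;
 *	}
 */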