// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/desc.h>
#include <asm/prctl.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		/*
		 * .sp0 is only used when entering ring 0 from a lower
		 * privilege level. Since the init task never runs anything
		 * but ring 0 code, there is no need for a valid value here.
		 * Poison it.
		 */
		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
		.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
#endif
	},
#ifdef CONFIG_X86_32
	/*
	 * Note that the .io_bitmap member must be extra-big. This is because
	 * the CPU will access an additional byte beyond the end of the IO
	 * permission bitmap. The extra byte must be all 1 bits, and must
	 * be within the limit.
	 */
	.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
	.SYSENTER_stack_canary = STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);

/*
 * This gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif

	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

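/*
 * For illustration, a rough sketch of how the helper above is reached
 * on the fork path (see kernel/fork.c for the authoritative flow):
 *
 *	copy_process()
 *	    dup_task_struct()
 *	        arch_dup_task_struct(dst, src)	// memcpy + fpu__copy()
 *
 * so the child already owns a private copy of the FPU state (and a
 * NULLed vm86 pointer) before the arch copy_thread*() callback runs.
 */
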
/*
 * Free current thread data structures etc..
 */
void exit_thread(struct task_struct *tsk)
{
	struct thread_struct *t = &tsk->thread;
	unsigned long *bp = t->io_bitmap_ptr;
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	free_vm86(t);

	fpu__drop(fpu);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear(&tsk->thread.fpu);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_set_bits(X86_CR4_TSD);
	preempt_enable();
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_clear_bits(X86_CR4_TSD);
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

DEFINE_PER_CPU(u64, msr_misc_features_shadow);

static void set_cpuid_faulting(bool on)
{
	u64 msrval;

	msrval = this_cpu_read(msr_misc_features_shadow);
	msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
	msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
	this_cpu_write(msr_misc_features_shadow, msrval);
	wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
}

static void disable_cpuid(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOCPUID)) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOCPUID in the current running context.
		 */
		set_cpuid_faulting(true);
	}
	preempt_enable();
}

static void enable_cpuid(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOCPUID)) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOCPUID in the current running context.
		 */
		set_cpuid_faulting(false);
	}
	preempt_enable();
}

static int get_cpuid_mode(void)
{
	return !test_thread_flag(TIF_NOCPUID);
}

static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
{
	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
		return -ENODEV;

	if (cpuid_enabled)
		enable_cpuid();
	else
		disable_cpuid();

	return 0;
}

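/*
 * Illustrative userspace sketch (the includes and syscall spelling are
 * assumptions, check your libc headers): the TSC and CPUID controls
 * above are reached via prctl(2) and arch_prctl(2), e.g.:
 *
 *	#include <sys/prctl.h>		// PR_SET_TSC, PR_TSC_SIGSEGV
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <asm/prctl.h>		// ARCH_SET_CPUID
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV);		// RDTSC now faults
 *	syscall(SYS_arch_prctl, ARCH_SET_CPUID, 0);	// CPUID now faults
 *
 * ARCH_SET_CPUID only succeeds when the CPU supports CPUID faulting
 * (X86_FEATURE_CPUID_FAULT); otherwise set_cpuid_mode() returns -ENODEV.
 */
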
/*
 * Called immediately after a successful exec.
 */
void arch_setup_new_exec(void)
{
	/* If cpuid was previously disabled for this task, re-enable it. */
	if (test_thread_flag(TIF_NOCPUID))
		enable_cpuid();
}

static inline void switch_to_bitmap(struct tss_struct *tss,
				    struct thread_struct *prev,
				    struct thread_struct *next,
				    unsigned long tifp, unsigned long tifn)
{
	if (tifn & _TIF_IO_BITMAP) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
		/*
		 * Make sure that the TSS limit is correct for the CPU
		 * to notice the IO bitmap.
		 */
		refresh_tss_limit();
	} else if (tifp & _TIF_IO_BITMAP) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;
	unsigned long tifp, tifn;

	prev = &prev_p->thread;
	next = &next_p->thread;

	tifn = READ_ONCE(task_thread_info(next_p)->flags);
	tifp = READ_ONCE(task_thread_info(prev_p)->flags);
	switch_to_bitmap(tss, prev, next, tifp, tifn);

	propagate_user_return_notify(prev_p, next_p);

	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
	    arch_has_block_step()) {
		unsigned long debugctl, msk;

		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
		debugctl &= ~DEBUGCTLMSR_BTF;
		msk = tifn & _TIF_BLOCKSTEP;
		debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	}

	if ((tifp ^ tifn) & _TIF_NOTSC)
		cr4_toggle_bits_irqsoff(X86_CR4_TSD);

	if ((tifp ^ tifn) & _TIF_NOCPUID)
		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
}

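/*
 * Worked example for the branchless BTF update above, assuming the
 * usual bit definitions: _TIF_BLOCKSTEP is (1 << TIF_BLOCKSTEP) and
 * DEBUGCTLMSR_BTF is (1 << DEBUGCTLMSR_BTF_SHIFT), so
 *
 *	msk = tifn & _TIF_BLOCKSTEP;		// 0 or 1 << TIF_BLOCKSTEP
 *	msk >> TIF_BLOCKSTEP			// 0 or 1
 *	... << DEBUGCTLMSR_BTF_SHIFT		// 0 or DEBUGCTLMSR_BTF
 *
 * i.e. the next task's TIF_BLOCKSTEP flag is copied straight into the
 * BTF bit of MSR_IA32_DEBUGCTLMSR without a conditional branch.
 */
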
/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

void arch_cpu_idle_enter(void)
{
	tsc_verify_tsc_adjust(false);
	local_touch_nmi();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void __cpuidle default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	for (;;) {
		/*
		 * Use wbinvd followed by hlt to stop the processor. This
		 * provides support for kexec on a processor that supports
		 * SME. With kexec, going from SME inactive to SME active
		 * requires clearing cache entries so that addresses without
		 * the encryption bit set don't corrupt the same physical
		 * address that has the encryption bit set when caches are
		 * flushed. To achieve this a wbinvd is performed followed by
		 * a hlt. Even if the processor is not in the kexec/SME
		 * scenario this only adds a wbinvd to a halting processor.
		 */
		asm volatile("wbinvd; hlt" : : : "memory");
	}
}

/*
 * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
 * states (local apic timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	/*
	 * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
	 * gets set after static_cpu_has() places have been converted via
	 * alternatives.
	 */
	if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		default_idle();
		return;
	}

	tick_broadcast_enter();

	default_idle();

	/*
	 * The switch back from broadcast mode needs to be called with
	 * interrupts disabled.
	 */
	local_irq_disable();
	tick_broadcast_exit();
	local_irq_enable();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
		return 0;

	return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

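/*
 * Boot-time idle selection, summarized from select_idle_routine() below
 * (cpuidle drivers that load later take precedence over all of these):
 *
 *	idle=poll on the command line	-> generic polling idle, x86_idle unused
 *	X86_BUG_AMD_E400		-> amd_e400_idle()
 *	Intel with working MWAIT	-> mwait_idle()
 *	everything else			-> default_idle() (HLT)
 */
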
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void amd_e400_c1e_apic_setup(void)
{
	if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
		local_irq_disable();
		tick_broadcast_force();
		local_irq_enable();
	}
}

void __init arch_post_acpi_subsys_init(void)
{
	u32 lo, hi;

	if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
		return;

	/*
	 * AMD E400 detection needs to happen after ACPI has been enabled. If
	 * the machine is affected, K8_INTP_C1E_ACTIVE_MASK bits are set in
	 * MSR_K8_INT_PENDING_MSG.
	 */
	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
	if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
		return;

	boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);

	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
		mark_tsc_unstable("TSC halt in AMD C1E");
	pr_info("System has AMD C1E enabled\n");
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * With the idle=halt boot option, halt is forced to be used
		 * for CPU idle, and the CPU C2/C3 states won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * With the idle=nomwait boot option, mwait is disabled for
		 * the CPU C2/C3 states. In that case it won't touch the
		 * boot_option_idle_override variable.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	return randomize_page(mm->brk, 0x02000000);
}

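/*
 * For reference, the randomization ranges above work out as follows:
 * arch_align_stack() subtracts a random 0-8191 byte offset and then
 * rounds down to 16 bytes, i.e. 512 possible 16-byte-aligned starting
 * points; arch_randomize_brk() places the heap start at a page-aligned
 * offset of up to 0x02000000 bytes (32 MiB) above mm->brk.
 */
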
/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	if (!try_get_task_stack(p))
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		goto out;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start
	 *
	 * The task's stack pointer points at the location where the
	 * framepointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start;

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		goto out;

	fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
	do {
		if (fp < bottom || fp > top)
			goto out;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip)) {
			ret = ip;
			goto out;
		}
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);

out:
	put_task_stack(p);
	return ret;
}

long do_arch_prctl_common(struct task_struct *task, int option,
			  unsigned long cpuid_enabled)
{
	switch (option) {
	case ARCH_GET_CPUID:
		return get_cpuid_mode();
	case ARCH_SET_CPUID:
		return set_cpuid_mode(task, cpuid_enabled);
	}

	return -EINVAL;
}
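
/*
 * Reference note: do_arch_prctl_common() handles the arch_prctl(2)
 * options valid for both 32-bit and 64-bit tasks; bitness-specific
 * options such as ARCH_SET_FS/ARCH_SET_GS are dispatched separately
 * (see process_64.c). From userspace the CPUID mode is read back from
 * the syscall return value, e.g. (illustrative sketch, userspace names
 * assumed):
 *
 *	int cpuid_works = syscall(SYS_arch_prctl, ARCH_GET_CPUID, 0);
 *	// 1: CPUID executes normally, 0: CPUID faulting is enabled
 */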