#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/desc.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
#endif
	},
#ifdef CONFIG_X86_32
	/*
	 * Note that the .io_bitmap member must be extra-big. This is because
	 * the CPU will access an additional byte beyond the end of the IO
	 * permission bitmap. The extra byte must be all 1 bits, and must
	 * be within the limit.
	 */
	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
	.SYSENTER_stack_canary	= STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

DEFINE_PER_CPU(bool, need_tr_refresh);
EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);

/*
 * This gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif

	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
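/*
 * Illustrative sketch, not part of the original file: the per-task IO
 * bitmap that exit_thread() below releases is normally created by the
 * ioperm() syscall. A privileged (CAP_SYS_RAWIO) program might request
 * port access roughly like this (the port range is a made-up example):
 *
 *	#include <sys/io.h>
 *
 *	ioperm(0x378, 4, 1);
 *
 * That call allocates thread.io_bitmap_ptr, sets TIF_IO_BITMAP, and lets
 * __switch_to_xtra() copy the bitmap into the per-CPU TSS on every switch
 * to the task.
 */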
/*
 * Free current thread data structures etc..
 */
void exit_thread(struct task_struct *tsk)
{
	struct thread_struct *t = &tsk->thread;
	unsigned long *bp = t->io_bitmap_ptr;
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	free_vm86(t);

	fpu__drop(fpu);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear(&tsk->thread.fpu);
}

static void hard_disable_TSC(void)
{
	cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
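/*
 * Illustrative sketch, not part of the original file: get_tsc_mode() and
 * set_tsc_mode() above implement the PR_GET_TSC/PR_SET_TSC prctl()
 * interface. A userspace task could revoke its own RDTSC access roughly
 * like this:
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *
 * Afterwards any RDTSC the task executes faults (CR4.TSD is set while it
 * runs with TIF_NOTSC) and is delivered as SIGSEGV; PR_SET_TSC with
 * PR_TSC_ENABLE switches it back on.
 */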
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));

		/*
		 * Make sure that the TSS limit is correct for the CPU
		 * to notice the IO bitmap.
		 */
		refresh_TR();
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

void arch_cpu_idle_enter(void)
{
	tsc_verify_tsc_adjust(false);
	local_touch_nmi();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void __cpuidle default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	for (;;)
		halt();
}

/*
 * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
 * states (local apic timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	/*
	 * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
	 * gets set after static_cpu_has() places have been converted via
	 * alternatives.
	 */
	if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		default_idle();
		return;
	}

	tick_broadcast_enter();

	default_idle();

	/*
	 * The switch back from broadcast mode needs to be called with
	 * interrupts disabled.
	 */
	local_irq_disable();
	tick_broadcast_exit();
	local_irq_enable();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
		return 0;

	return 1;
}
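/*
 * Background note, added for clarity: the routine below is built on the
 * MONITOR/MWAIT pairing. The generic pattern is roughly
 *
 *	__monitor(&current_thread_info()->flags, 0, 0);
 *	if (!need_resched())
 *		__mwait(hint, 0);
 *
 * i.e. arm monitoring of the thread flags word, re-check for work after
 * arming (this closes the race with a remote wakeup, which only needs to
 * set TIF_NEED_RESCHED rather than send an IPI), then wait. A hint of 0,
 * as used here, asks for the shallowest state (C1); the code below uses
 * __sti_mwait() so that interrupts are re-enabled in the instruction
 * immediately preceding MWAIT.
 */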
/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void amd_e400_c1e_apic_setup(void)
{
	if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
		local_irq_disable();
		tick_broadcast_force();
		local_irq_enable();
	}
}

void __init arch_post_acpi_subsys_init(void)
{
	u32 lo, hi;

	if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
		return;

	/*
	 * AMD E400 detection needs to happen after ACPI has been enabled. If
	 * the machine is affected, K8_INTP_C1E_ACTIVE_MASK bits are set in
	 * MSR_K8_INT_PENDING_MSG.
	 */
	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
	if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
		return;

	boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);

	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
		mark_tsc_unstable("TSC halt in AMD C1E");
	pr_info("System has AMD C1E enabled\n");
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);
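/*
 * Usage note, added for clarity: idle_setup() above is registered with
 * early_param(), so the override is selected on the kernel command line,
 * for example:
 *
 *	idle=poll	busy-poll instead of halting in the idle loop
 *	idle=halt	force HLT as the default idle routine
 *	idle=nomwait	keep MWAIT from being used for the C2/C3 states
 *
 * Any other value makes idle_setup() fail and the option is rejected.
 */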
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	return randomize_page(mm->brk, 0x02000000);
}

/*
 * Return saved PC of a blocked thread.
 * What is this good for? It will always be the scheduler or ret_from_fork.
 */
unsigned long thread_saved_pc(struct task_struct *tsk)
{
	struct inactive_task_frame *frame =
		(struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
	return READ_ONCE_NOCHECK(frame->ret_addr);
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	if (!try_get_task_stack(p))
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		goto out;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start
	 *
	 * The task's stack pointer points at the location where the
	 * frame pointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start;

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		goto out;

	fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
	do {
		if (fp < bottom || fp > top)
			goto out;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip)) {
			ret = ip;
			goto out;
		}
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);

out:
	put_task_stack(p);
	return ret;
}
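/*
 * Usage note, added for clarity: get_wchan() is what ultimately backs
 * /proc/<pid>/wchan (and the WCHAN column of "ps -o wchan"). For a task
 * sleeping in poll(), for instance, the reported symbol might look like
 * "do_sys_poll"; the exact name depends on kallsyms and on where the task
 * actually blocked.
 */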