#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
        .x86_tss = {
                .sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
                .ss0 = __KERNEL_DS,
                .ss1 = __KERNEL_CS,
                .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
#endif
        },
#ifdef CONFIG_X86_32
        /*
         * Note that the .io_bitmap member must be extra-big. This is because
         * the CPU will access an additional byte beyond the end of the IO
         * permission bitmap. The extra byte must be all 1 bits, and must
         * be within the limit.
         */
        .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
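
/*
 * Illustrative sketch, not part of this file: an x86-64 driver that wants
 * to hear about idle transitions registers a notifier_block whose callback
 * receives IDLE_START or IDLE_END (from <asm/idle.h>) as the action
 * argument. The chain is atomic, so the callback must not sleep. The names
 * below are made up for the example:
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *data)
 *      {
 *              return NOTIFY_OK;
 *      }
 *
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_notify,
 *      };
 *
 *      idle_notifier_register(&my_idle_nb);
 */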

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
        memcpy(dst, src, arch_task_struct_size);

        return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;
        unsigned long *bp = t->io_bitmap_ptr;
        struct fpu *fpu = &t->fpu;

        if (bp) {
                struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
                kfree(bp);
        }

        free_vm86(t);

        fpu__drop(fpu);
}

void flush_thread(void)
{
        struct task_struct *tsk = current;

        flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

        fpu__clear(&tsk->thread.fpu);
}

static void hard_disable_TSC(void)
{
        cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
{
        preempt_disable();
        if (!test_and_set_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                hard_disable_TSC();
        preempt_enable();
}

static void hard_enable_TSC(void)
{
        cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
{
        preempt_disable();
        if (test_and_clear_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                hard_enable_TSC();
        preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
        unsigned int val;

        if (test_thread_flag(TIF_NOTSC))
                val = PR_TSC_SIGSEGV;
        else
                val = PR_TSC_ENABLE;

        return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
        if (val == PR_TSC_SIGSEGV)
                disable_TSC();
        else if (val == PR_TSC_ENABLE)
                enable_TSC();
        else
                return -EINVAL;

        return 0;
}
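
/*
 * get_tsc_mode()/set_tsc_mode() above are the arch backends for the
 * PR_GET_TSC/PR_SET_TSC prctl()s. Illustrative userspace sketch, not part
 * of this file: make RDTSC fault for the calling task, then read the mode
 * back (the PR_TSC_* constants come from <linux/prctl.h>):
 *
 *      #include <sys/prctl.h>
 *
 *      int mode;
 *
 *      prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
 *      prctl(PR_GET_TSC, &mode, 0, 0, 0);
 *
 * after which mode reads back as PR_TSC_SIGSEGV and a subsequent RDTSC in
 * this task raises SIGSEGV until PR_TSC_ENABLE is set again.
 */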

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                      struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
            test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
                unsigned long debugctl = get_debugctlmsr();

                debugctl &= ~DEBUGCTLMSR_BTF;
                if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
                        debugctl |= DEBUGCTLMSR_BTF;

                update_debugctlmsr(debugctl);
        }

        if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
            test_tsk_thread_flag(next_p, TIF_NOTSC)) {
                /* prev and next are different */
                if (test_tsk_thread_flag(next_p, TIF_NOTSC))
                        hard_disable_TSC();
                else
                        hard_enable_TSC();
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
        propagate_user_return_notify(prev_p, next_p);
}
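
/*
 * For context: the io_bitmap_ptr consumed above (and freed in exit_thread())
 * is allocated by the ioperm() syscall in arch/x86/kernel/ioport.c, which
 * also sets TIF_IO_BITMAP. Illustrative userspace sketch, not part of this
 * file; run as root, after which each switch to this task copies its IO
 * bitmap into the per-CPU TSS:
 *
 *      #include <sys/io.h>
 *
 *      ioperm(0x378, 3, 1);
 *      outb(0xff, 0x378);
 *
 * where 0x378 is the legacy parallel port base and the third ioperm()
 * argument turns access on.
 */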

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
        this_cpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
        local_touch_nmi();
        enter_idle();
}

void arch_cpu_idle_exit(void)
{
        __exit_idle();
}

void arch_cpu_idle_dead(void)
{
        play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
        x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void default_idle(void)
{
        trace_cpu_idle_rcuidle(1, smp_processor_id());
        safe_halt();
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
        bool ret = !!x86_idle;

        x86_idle = default_idle;

        return ret;
}
#endif
void stop_this_cpu(void *dummy)
{
        local_irq_disable();
        /*
         * Remove this CPU:
         */
        set_cpu_online(smp_processor_id(), false);
        disable_local_APIC();
        mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

        for (;;)
                halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
        if (amd_e400_c1e_mask != NULL)
                cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void amd_e400_idle(void)
{
        if (!amd_e400_c1e_detected) {
                u32 lo, hi;

                rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

                if (lo & K8_INTP_C1E_ACTIVE_MASK) {
                        amd_e400_c1e_detected = true;
                        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                                mark_tsc_unstable("TSC halt in AMD C1E");
                        pr_info("System has AMD C1E enabled\n");
                }
        }

        if (amd_e400_c1e_detected) {
                int cpu = smp_processor_id();

                if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
                        cpumask_set_cpu(cpu, amd_e400_c1e_mask);
                        /* Force broadcast so ACPI can not interfere. */
                        tick_broadcast_force();
                        pr_info("Switch to broadcast mode on CPU%d\n", cpu);
                }
                tick_broadcast_enter();

                default_idle();

                /*
                 * The switch back from broadcast mode needs to be
                 * called with interrupts disabled.
                 */
                local_irq_disable();
                tick_broadcast_exit();
                local_irq_enable();
        } else
                default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
        if (c->x86_vendor != X86_VENDOR_INTEL)
                return 0;

        if (!cpu_has(c, X86_FEATURE_MWAIT))
                return 0;

        return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static void mwait_idle(void)
{
        if (!current_set_polling_and_test()) {
                trace_cpu_idle_rcuidle(1, smp_processor_id());
                if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
                        smp_mb(); /* quirk */
                        clflush((void *)&current_thread_info()->flags);
                        smp_mb(); /* quirk */
                }

                __monitor((void *)&current_thread_info()->flags, 0, 0);
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
                trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
        } else {
                local_irq_enable();
        }
        __current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
        if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
                pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
        if (x86_idle || boot_option_idle_override == IDLE_POLL)
                return;

        if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
                /* E400: APIC timer interrupt does not wake up CPU from C1e */
                pr_info("using AMD E400 aware idle routine\n");
                x86_idle = amd_e400_idle;
        } else if (prefer_mwait_c1_over_halt(c)) {
                pr_info("using mwait in idle threads\n");
                x86_idle = mwait_idle;
        } else
                x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
        /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
        if (x86_idle == amd_e400_idle)
                zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (!strcmp(str, "poll")) {
                pr_info("using polling idle threads\n");
                boot_option_idle_override = IDLE_POLL;
                cpu_idle_poll_ctrl(true);
        } else if (!strcmp(str, "halt")) {
                /*
                 * When the boot option of idle=halt is added, halt is
                 * forced to be used for CPU idle. In such case CPU C2/C3
                 * won't be used again.
                 * To continue to load the CPU idle driver, don't touch
                 * the boot_option_idle_override.
                 */
                x86_idle = default_idle;
                boot_option_idle_override = IDLE_HALT;
        } else if (!strcmp(str, "nomwait")) {
                /*
                 * If the boot option of "idle=nomwait" is added,
                 * it means that mwait will be disabled for CPU C2/C3
                 * states. In such case it won't touch the variable
                 * of boot_option_idle_override.
                 */
                boot_option_idle_override = IDLE_NOMWAIT;
        } else
                return -1;

        return 0;
}
early_param("idle", idle_setup);
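
/*
 * Illustrative only: idle_setup() above handles the "idle=" kernel command
 * line option, i.e.
 *
 *      idle=poll       busy-poll in the idle loop instead of halting (IDLE_POLL)
 *      idle=halt       always idle with HLT; C2/C3 are not used (IDLE_HALT)
 *      idle=nomwait    do not use MWAIT for C2/C3 idle states (IDLE_NOMWAIT)
 *
 * Any other value is rejected (idle_setup() returns -1).
 */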

unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long range_end = mm->brk + 0x02000000;
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long start, bottom, top, sp, fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;

        start = (unsigned long)task_stack_page(p);
        if (!start)
                return 0;

        /*
         * Layout of the stack page:
         *
         * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
         * PADDING
         * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
         * stack
         * ----------- bottom = start + sizeof(thread_info)
         * thread_info
         * ----------- start
         *
         * The task's stack pointer points at the location where the
         * frame pointer is stored. The data on the stack is:
         * ... IP FP ... IP FP
         *
         * We need to read FP and IP, so we need to adjust the upper
         * bound by another unsigned long.
         */
        top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
        top -= 2 * sizeof(unsigned long);
        bottom = start + sizeof(struct thread_info);

        sp = READ_ONCE(p->thread.sp);
        if (sp < bottom || sp > top)
                return 0;

        fp = READ_ONCE(*(unsigned long *)sp);
        do {
                if (fp < bottom || fp > top)
                        return 0;
                ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long)));
                if (!in_sched_functions(ip))
                        return ip;
                fp = READ_ONCE(*(unsigned long *)fp);
        } while (count++ < 16 && p->state != TASK_RUNNING);
        return 0;
}
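
/*
 * Illustrative only: get_wchan() backs /proc/<pid>/wchan (and thereby the
 * WCHAN column of ps), e.g. from a shell:
 *
 *      $ cat /proc/<pid>/wchan
 *
 * which typically prints the name of the function in which a sleeping task
 * blocked, or 0 if the task is running.
 */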