#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
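
/*
 * Minimal usage sketch for the idle notifier API above (illustrative only,
 * not part of this file). A driver built for x86-64 could register a
 * callback that is invoked with IDLE_START/IDLE_END around each idle
 * period; the names my_idle_notify/my_idle_nb are made up for the example.
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		if (action == IDLE_START)
 *			pr_debug("cpu entering idle\n");
 *		else if (action == IDLE_END)
 *			pr_debug("cpu leaving idle\n");
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call	= my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 *	...
 *	idle_notifier_unregister(&my_idle_nb);
 */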

struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	int ret;

	*dst = *src;
	if (fpu_allocated(&src->thread.fpu)) {
		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
		ret = fpu_alloc(&dst->thread.fpu);
		if (ret)
			return ret;
		fpu_copy(dst, src);
	}
	return 0;
}

void free_thread_xstate(struct task_struct *tsk)
{
	fpu_free(&tsk->thread.fpu);
}

void arch_release_task_struct(struct task_struct *tsk)
{
	free_thread_xstate(tsk);
}

void arch_task_cache_init(void)
{
	task_xstate_cachep =
		kmem_cache_create("task_xstate", xstate_size,
				  __alignof__(union thread_xstate),
				  SLAB_PANIC | SLAB_NOTRACK, NULL);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
	unsigned long *bp = t->io_bitmap_ptr;

	if (bp) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	drop_fpu(me);
}

void show_regs_common(void)
{
	const char *vendor, *product, *board;

	vendor = dmi_get_system_info(DMI_SYS_VENDOR);
	if (!vendor)
		vendor = "";
	product = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!product)
		product = "";

	/* Board Name is optional */
	board = dmi_get_system_info(DMI_BOARD_NAME);

	printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n",
	       current->pid, current->comm, print_tainted(),
	       init_utsname()->release,
	       (int)strcspn(init_utsname()->version, " "),
	       init_utsname()->version,
	       vendor, product,
	       board ? "/" : "",
	       board ? board : "");
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	drop_init_fpu(tsk);
	/*
	 * Free the FPU state for non xsave platforms. They get reallocated
	 * lazily at the first use.
	 */
	if (!use_eager_fpu())
		free_thread_xstate(tsk);
}

static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
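
/*
 * Userspace sketch (illustrative, not part of this file): get_tsc_mode() and
 * set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctl() interface, so a task
 * that wants RDTSC to fault (e.g. to emulate or audit it) could do roughly:
 *
 *	#include <sys/prctl.h>
 *	#include <linux/prctl.h>
 *
 *	int old;
 *
 *	prctl(PR_GET_TSC, &old);		(read back the current mode)
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	(RDTSC now raises SIGSEGV)
 *	...
 *	prctl(PR_SET_TSC, PR_TSC_ENABLE);	(restore normal behaviour)
 */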

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}
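
/*
 * Rough caller-side sketch, shown here only for orientation (the real check
 * lives in __switch_to() in process_32.c/process_64.c): the slow path above
 * is entered only when one of the relevant TIF flags is set on either the
 * previous or the next task.
 *
 *	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
 *		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
 *		__switch_to_xtra(prev_p, next_p, tss);
 */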

int sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
int sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}

long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(pm_idle);
#endif

static inline int hlt_use_halt(void)
{
	return 1;
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized but no harm in
	 * doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();
	current_thread_info()->status |= TS_POLLING;

	while (1) {
		tick_nohz_idle_enter();

		while (!need_resched()) {
			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();

			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_touch_nmi();
			local_irq_disable();

			enter_idle();

			/* Don't trace irqs off for idle */
			stop_critical_timings();

			/* enter_idle() needs rcu for notifiers */
			rcu_idle_enter();

			if (cpuidle_idle_call())
				pm_idle();

			rcu_idle_exit();
			start_critical_timings();

			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_idle_exit();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
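
/*
 * For orientation (informational, the callers live outside this file):
 * cpu_idle() is where each CPU ends up when it has nothing to run - the
 * boot CPU falls into it from rest_init() and secondary CPUs from
 * start_secondary() - and it never returns.
 */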

/*
 * We use this if we don't have any better
 * idle routine..
 */
void default_idle(void)
{
	if (hlt_use_halt()) {
		trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		current_thread_info()->status &= ~TS_POLLING;
		/*
		 * TS_POLLING-cleared state must be visible before we
		 * test NEED_RESCHED:
		 */
		smp_mb();

		if (!need_resched())
			safe_halt();	/* enables interrupts racelessly */
		else
			local_irq_enable();
		current_thread_info()->status |= TS_POLLING;
		trace_power_end_rcuidle(smp_processor_id());
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
		/* loop is done by the caller */
		cpu_relax();
	}
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

bool set_pm_idle_to_default(void)
{
	bool ret = !!pm_idle;

	pm_idle = default_idle;

	return ret;
}

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();

	for (;;) {
		if (hlt_works(smp_processor_id()))
			halt();
	}
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
			clflush((void *)&current_thread_info()->flags);

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_power_end_rcuidle(smp_processor_id());
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else
		local_irq_enable();
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->work.need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
	trace_cpu_idle_rcuidle(0, smp_processor_id());
	local_irq_enable();
	while (!need_resched())
		cpu_relax();
	trace_power_end_rcuidle(smp_processor_id());
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
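
/*
 * Roughly how the "no IPI" optimisation mentioned above works (the actual
 * check lives in the scheduler, not here): poll_idle() runs with TS_POLLING
 * still set - cpu_idle() sets it before entering the inner loop - so a
 * remote CPU waking this one only needs to set TIF_NEED_RESCHED and can skip
 * the reschedule IPI; the while (!need_resched()) loop above notices the
 * flag on its own. default_idle() clears TS_POLLING first precisely because
 * a halted CPU does need the interrupt to wake up.
 */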

/*
 * mwait selection logic:
 *
 * It depends on the CPU. For AMD CPUs that support MWAIT this is
 * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
 * then depend on a clock divisor and current Pstate of the core. If
 * all cores of a processor are in halt state (C1) the processor can
 * enter the C1E (C1 enhanced) state. If mwait is used this will never
 * happen.
 *
 * idle=mwait overrides this decision and forces the usage of mwait.
 */

#define MWAIT_INFO			0x05
#define MWAIT_ECX_EXTENDED_INFO		0x01
#define MWAIT_EDX_C1			0xf0

int mwait_usable(const struct cpuinfo_x86 *c)
{
	u32 eax, ebx, ecx, edx;

	/* Use mwait if idle=mwait boot option is given */
	if (boot_option_idle_override == IDLE_FORCE_MWAIT)
		return 1;

	/*
	 * Any idle= boot option other than idle=mwait means that we must not
	 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
	 */
	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
		return 0;

	if (c->cpuid_level < MWAIT_INFO)
		return 0;

	cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
	/* Check whether ECX flags extended info about MWAIT */
	if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
		return 1;

	/*
	 * edx enumerates MONITOR/MWAIT extensions. Check whether
	 * C1 supports MWAIT
	 */
	return (edx & MWAIT_EDX_C1);
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void amd_e400_idle(void)
{
	if (need_resched())
		return;

	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/*
			 * Force broadcast so ACPI can not interfere.
			 */
			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
					   &cpu);
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
		local_irq_enable();
	} else
		default_idle();
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (pm_idle == poll_idle && smp_num_siblings > 1) {
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
	}
#endif
	if (pm_idle)
		return;

	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
		/*
		 * One CPU supports mwait => All CPUs support mwait
		 */
		pr_info("using mwait in idle threads\n");
		pm_idle = mwait_idle;
	} else if (cpu_has_amd_erratum(amd_erratum_400)) {
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		pm_idle = amd_e400_idle;
	} else
		pm_idle = default_idle;
}
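
/*
 * Selection summary (informational): unless "idle=" on the command line has
 * already pinned pm_idle (e.g. idle=poll or idle=halt, see idle_setup()
 * below), select_idle_routine() prefers mwait_idle() when MONITOR/MWAIT is
 * usable, falls back to amd_e400_idle() on CPUs with erratum 400, and
 * otherwise uses the plain HLT-based default_idle().
 */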

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (pm_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		pm_idle = poll_idle;
		boot_option_idle_override = IDLE_POLL;
	} else if (!strcmp(str, "mwait")) {
		boot_option_idle_override = IDLE_FORCE_MWAIT;
		WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		pm_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
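
/*
 * Note (informational): with randomization enabled, arch_randomize_brk()
 * moves the heap base to a random page within [mm->brk, mm->brk + 32 MiB)
 * (0x02000000 bytes), while arch_align_stack() subtracts up to 8 KiB from
 * the initial stack pointer and re-aligns it down to 16 bytes.
 */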