#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif

struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	int ret;

	*dst = *src;
	if (fpu_allocated(&src->thread.fpu)) {
		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
		ret = fpu_alloc(&dst->thread.fpu);
		if (ret)
			return ret;
		fpu_copy(dst, src);
	}
	return 0;
}

void free_thread_xstate(struct task_struct *tsk)
{
	fpu_free(&tsk->thread.fpu);
}

void arch_release_task_struct(struct task_struct *tsk)
{
	free_thread_xstate(tsk);
}

void arch_task_cache_init(void)
{
	task_xstate_cachep =
		kmem_cache_create("task_xstate", xstate_size,
				  __alignof__(union thread_xstate),
				  SLAB_PANIC | SLAB_NOTRACK, NULL);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
	unsigned long *bp = t->io_bitmap_ptr;

	if (bp) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	drop_fpu(me);
}
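
/*
 * flush_thread() is called on exec: clear the hardware breakpoints, the TLS
 * entries and the FPU state so the new program image starts with a clean
 * thread state.
 */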
void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	drop_init_fpu(tsk);
	/*
	 * Free the FPU state for non xsave platforms. They get reallocated
	 * lazily at the first use.
	 */
	if (!use_eager_fpu())
		free_thread_xstate(tsk);
}

static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
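
/*
 * __switch_to_xtra() handles the slow-path parts of a context switch that
 * __switch_to() does not touch itself: block-step (DEBUGCTLMSR_BTF), TSC
 * access (CR4.TSD), the I/O permission bitmap in the TSS and the
 * user-return notifiers. It is called when either the previous or the next
 * task has one of the relevant TIF work flags set.
 */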
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
	local_touch_nmi();
	enter_idle();
}

void arch_cpu_idle_exit(void)
{
	__exit_idle();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif
void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();

	for (;;)
		halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void amd_e400_idle(void)
{
	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/*
			 * Force broadcast so ACPI can not interfere.
			 */
			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
					   &cpu);
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
		local_irq_enable();
	} else
		default_idle();
}
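
/*
 * Pick the idle routine for this CPU: keep a routine that was already set
 * up (or forced on the command line), use the E400-aware routine on
 * affected AMD parts and default_idle() everywhere else.
 */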
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else
		x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (x86_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}