#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif

struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	int ret;

	*dst = *src;
	if (fpu_allocated(&src->thread.fpu)) {
		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
		ret = fpu_alloc(&dst->thread.fpu);
		if (ret)
			return ret;
		fpu_copy(dst, src);
	}
	return 0;
}

void free_thread_xstate(struct task_struct *tsk)
{
	fpu_free(&tsk->thread.fpu);
}

void arch_release_task_struct(struct task_struct *tsk)
{
	free_thread_xstate(tsk);
}

void arch_task_cache_init(void)
{
	task_xstate_cachep =
		kmem_cache_create("task_xstate", xstate_size,
				  __alignof__(union thread_xstate),
				  SLAB_PANIC | SLAB_NOTRACK, NULL);
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
	unsigned long *bp = t->io_bitmap_ptr;

	if (bp) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	drop_fpu(me);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	drop_init_fpu(tsk);
	/*
	 * Free the FPU state for non xsave platforms. They get reallocated
	 * lazily at the first use.
	 */
	if (!use_eager_fpu())
		free_thread_xstate(tsk);
}

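/*
 * CR4.TSD ("time stamp disable") turns RDTSC into a privileged instruction,
 * so a task that requested PR_TSC_SIGSEGV via prctl(PR_SET_TSC, ...) takes a
 * #GP, and therefore a SIGSEGV, on its next user-mode RDTSC.  The helpers
 * below flip that bit on the current CPU; __switch_to_xtra() keeps it in
 * sync with TIF_NOTSC across context switches.
 */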
static void hard_disable_TSC(void)
{
	write_cr4(read_cr4() | X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	write_cr4(read_cr4() & ~X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
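		 * The CPU consults the TSS I/O permission bitmap on user-space
		 * port accesses (unless IOPL already allows them), so the
		 * incoming task's ioperm() bitmap must be present here.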
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
	local_touch_nmi();
	enter_idle();
}

void arch_cpu_idle_exit(void)
{
	__exit_idle();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	if (cpuidle_idle_call())
		x86_idle();
	else
		local_irq_enable();
}

/*
 * We use this if we don't have any better idle routine..
 */
void default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif
void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();

	for (;;)
		halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void amd_e400_idle(void)
{
	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/*
			 * Force broadcast so ACPI can not interfere.
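			 * The local APIC timer stops in C1E, so the broadcast
			 * clockevent device has to provide timer interrupts
			 * for this CPU while it idles.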
			 */
			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
					   &cpu);
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
		local_irq_enable();
	} else
		default_idle();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else
		x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (x86_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}