#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/desc.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
#endif
	},
#ifdef CONFIG_X86_32
	/*
	 * Note that the .io_bitmap member must be extra-big. This is because
	 * the CPU will access an additional byte beyond the end of the IO
	 * permission bitmap. The extra byte must be all 1 bits, and must
	 * be within the limit.
	 */
	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
	.SYSENTER_stack_canary	= STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

DEFINE_PER_CPU(bool, need_tr_refresh);
EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif

	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
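/*
 * The per-task IO bitmap freed in exit_thread() below (and copied into the
 * per-CPU TSS by __switch_to_xtra()) is normally allocated when a task asks
 * for I/O port access via the ioperm() syscall. A minimal userspace sketch,
 * illustrative only and not part of this file (port 0x378 is an arbitrary
 * example):
 *
 *	#include <sys/io.h>
 *
 *	if (ioperm(0x378, 8, 1) == 0) {		// needs CAP_SYS_RAWIO
 *		outb(0x00, 0x378);		// direct port access now allowed
 *		ioperm(0x378, 8, 0);		// drop the permission again
 *	}
 */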
/*
 * Free current thread data structures etc..
 */
void exit_thread(struct task_struct *tsk)
{
	struct thread_struct *t = &tsk->thread;
	unsigned long *bp = t->io_bitmap_ptr;
	struct fpu *fpu = &t->fpu;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	free_vm86(t);

	fpu__drop(fpu);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear(&tsk->thread.fpu);
}

static void hard_disable_TSC(void)
{
	cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
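/*
 * get_tsc_mode()/set_tsc_mode() above are the arch backends for the
 * PR_GET_TSC/PR_SET_TSC prctl() operations. A minimal userspace sketch,
 * illustrative only and not part of this file:
 *
 *	#include <sys/prctl.h>
 *	#include <linux/prctl.h>
 *
 *	int mode;
 *
 *	prctl(PR_GET_TSC, &mode, 0, 0, 0);		// read current mode
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);	// RDTSC now raises SIGSEGV
 */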
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));

		/*
		 * Make sure that the TSS limit is correct for the CPU
		 * to notice the IO bitmap.
		 */
		refresh_TR();
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

void arch_cpu_idle_enter(void)
{
	tsc_verify_tsc_adjust(false);
	local_touch_nmi();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void __cpuidle default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	for (;;)
		halt();
}

/*
 * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
 * states (local apic timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	/*
	 * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
	 * gets set after static_cpu_has() places have been converted via
	 * alternatives.
	 */
	if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		default_idle();
		return;
	}

	tick_broadcast_enter();

	default_idle();

	/*
	 * The switch back from broadcast mode needs to be called with
	 * interrupts disabled.
	 */
	local_irq_disable();
	tick_broadcast_exit();
	local_irq_enable();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
		return 0;

	return 1;
}
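/*
 * The MWAIT-based C1 idle below builds on the generic polling-flag protocol:
 * the idle CPU arms MONITOR on its own thread_info->flags word, so a remote
 * CPU can wake it with a plain store that sets TIF_NEED_RESCHED instead of
 * sending a reschedule IPI. Rough sketch of the sequence (see mwait_idle()
 * for the real code, including the CLFLUSH monitor quirk):
 *
 *	set TIF_POLLING_NRFLAG
 *	MONITOR &current_thread_info()->flags
 *	if (!need_resched())
 *		MWAIT		// any write to the monitored line wakes us
 *	clear TIF_POLLING_NRFLAG
 */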
/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void amd_e400_c1e_apic_setup(void)
{
	if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
		local_irq_disable();
		tick_broadcast_force();
		local_irq_enable();
	}
}

void __init arch_post_acpi_subsys_init(void)
{
	u32 lo, hi;

	if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
		return;

	/*
	 * AMD E400 detection needs to happen after ACPI has been enabled. If
	 * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in
	 * MSR_K8_INT_PENDING_MSG.
	 */
	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
	if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
		return;

	boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);

	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
		mark_tsc_unstable("TSC halt in AMD C1E");
	pr_info("System has AMD C1E enabled\n");
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option of idle=halt is added, halt is
		 * forced to be used for CPU idle. In such case CPU C2/C3
		 * won't be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option of "idle=nomwait" is added,
		 * it means that mwait will be disabled for CPU C2/C3
		 * states. In such case it won't touch the variable
		 * of boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);
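/*
 * Accepted values for the "idle=" boot parameter parsed above (behaviour as
 * implemented in idle_setup() and select_idle_routine()):
 *
 *	idle=poll	busy-poll in the idle loop; lowest wakeup latency,
 *			highest power draw, warns once if HT siblings exist
 *	idle=halt	always idle with HLT; C2/C3 states are not entered
 *	idle=nomwait	do not use MWAIT for the C2/C3 idle states
 */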
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	return randomize_page(mm->brk, 0x02000000);
}

/*
 * Return saved PC of a blocked thread.
 * What is this good for? It will always be the scheduler or ret_from_fork.
 */
unsigned long thread_saved_pc(struct task_struct *tsk)
{
	struct inactive_task_frame *frame =
		(struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
	return READ_ONCE_NOCHECK(frame->ret_addr);
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	if (!try_get_task_stack(p))
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		goto out;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start
	 *
	 * The task's stack pointer points at the location where the
	 * framepointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start;

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		goto out;

	fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
	do {
		if (fp < bottom || fp > top)
			goto out;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip)) {
			ret = ip;
			goto out;
		}
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);

out:
	put_task_stack(p);
	return ret;
}
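/*
 * Note: get_wchan() above is the backend for /proc/<pid>/wchan. Because the
 * target task can wake up while its stack is being walked, every access goes
 * through READ_ONCE()/READ_ONCE_NOCHECK(), each frame pointer is bounds
 * checked against the stack page pinned by try_get_task_stack(), and the
 * walk gives up after 16 frames.
 */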